Setup

load sero-epi functions

source("seroepi_functions.R")
source("https://raw.githubusercontent.com/klebgenomics/KleborateR/main/kleborate_functions.R")
## Loading required package: fs
## Loading required package: glue

set colour palettes

# source: https://davidmathlogic.com/colorblind/#%23D81B60-%231E88E5-%23FFC107-%23004D40
region_cols <- c(`Southern Asia` ="#D81B60", `Western Africa`="#1E88E5", `Eastern Africa`="#FFC107", `Southern Africa`="#81B1A9", Global="black")
group_cols <- c(All="black", Fatal ="#D81B60", ESBL="#1E88E5", CP="#FFC107")

load data on included samples - Table S3

data_NNS_sites10 <- read_tsv("tables/TableS3.tsv")
## Rows: 1877 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (14): Genome Name, Study, Region, Country, Cluster, ST, K_locus, K_type,...
## dbl  (7): Site, Year, Mortality, N50, contig_count, total_size, resistance_s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

K locus analysis

read posterior estimates for adjusted counts (main) and raw counts (for comparison)

# read adjusted Bayesian estimates for K
kbayes <- parseModelledEstimates(global_post="outputs_core/K_Full_min10_adj_28_posterior_global.csv.gz", region_post="outputs_core/K_Full_min10_adj_28_posterior_subgroup.csv.gz")
## Rows: 1080000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4320000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
# read raw Bayesian estimates for K
kbayes_raw <- parseModelledEstimates(global_post="outputs_core/K_Full_min10_raw_28_posterior_global.csv.gz", region_post="outputs_core/K_Full_min10_raw_28_posterior_subgroup.csv.gz")
## Rows: 1080000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4320000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.

global and regional prevalence - top20 K

k_prev_global_dist <- locus_ridgesplot(posterior=kbayes$global_post,
                                   ranks=kbayes$locus_rank, 
                                   lines_every=10, 
                                   xlim=c(0,12), xbreaks=seq(0,12,1), scale=1.5,
                                   title = "a) Global estimates")
## Joining with `by = join_by(locus)`
## Picking joint bandwidth of 0.0729
## Warning: Removed 50 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

k_prev_region_heatmap <- region_heatmap(kbayes$region_est, kbayes$global_est, 
                                   ranks=kbayes$locus_rank, median=F, title="b) Point estimates")
## Warning: Removed 300 rows containing missing values or values outside the scale range
## (`geom_tile()`).
## Warning: Removed 300 rows containing missing values or values outside the scale range
## (`geom_text()`).

fig 2

k_prev_global_dist + k_prev_region_heatmap
## Picking joint bandwidth of 0.0729
## Warning: Removed 50 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Warning: Removed 300 rows containing missing values or values outside the scale range
## (`geom_tile()`).
## Warning: Removed 300 rows containing missing values or values outside the scale range
## (`geom_text()`).

ggsave("figures/Fig2_K_global_regional_Kadj.pdf", width=8, height=6)
## Picking joint bandwidth of 0.0729
## Warning: Removed 50 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Warning: Removed 300 rows containing missing values or values outside the scale range
## (`geom_tile()`).
## Warning: Removed 300 rows containing missing values or values outside the scale range
## (`geom_text()`).
ggsave("figures/Fig2_K_global_regional_Kadj.png", width=8, height=6)
## Picking joint bandwidth of 0.0729
## Warning: Removed 50 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Warning: Removed 300 rows containing missing values or values outside the scale range
## (`geom_tile()`).
## Warning: Removed 300 rows containing missing values or values outside the scale range
## (`geom_text()`).

global and regional prevalence - all K

k_prev_global_dist_all <- locus_ridgesplot(posterior=kbayes$global_post,
                                   ranks=kbayes$locus_rank,
                                   maxRank = 90, axis_font_size=8,
                                   lines_every=10, 
                                   xlim=c(0,12), xbreaks=seq(0,12,1), scale=3, title="a) Global estimates")
## Joining with `by = join_by(locus)`
## Picking joint bandwidth of 0.0404
## Warning: Removed 50 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

k_prev_region_heatmap_all <- region_heatmap(kbayes$region_est, kbayes$global_est,
                                   ranks=kbayes$locus_rank,
                                   maxRank = 90, axis_font_size=8, median=F, title="b) Point estimates")

Figure S2 - modelled estimates for all K loci

k_prev_global_dist_all + k_prev_region_heatmap_all
## Picking joint bandwidth of 0.0404
## Warning: Removed 50 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

ggsave("figures/FigS2_K_global_regional_Kadj.pdf", width=8, height=10)
## Picking joint bandwidth of 0.0404
## Warning: Removed 50 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
ggsave("figures/FigS2_K_global_regional_Kadj.png", width=8, height=10)
## Picking joint bandwidth of 0.0404
## Warning: Removed 50 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

compare global prevalence distributions using cluster-adjusted and raw counts

# plot raw vs adjusted distributions, overlaid - for appendix s2.4b
k_dist_raw_adj <- comparative_locus_ridgesplot(posterior1=kbayes$global_post,
                             posterior2=kbayes_raw$global_post,
                             ranks=kbayes$locus_rank, 
                             lines_every=10, xlim=c(0,20), xbreaks=seq(0,20,1))
## Joining with `by = join_by(locus)`
## Picking joint bandwidth of 0.0595
## Warning: Removed 162 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

logistic regression of K vs year + region

result <- c()
for (k in kbayes$locus_rank$locus[1:30]) {
  d <- data_NNS_sites10 %>%
    mutate(k=if_else(K_locus==k, 1, 0)) %>%
    select(k, Year, Region)
  m <- summary(logistf(k~Year+Region, data=d))
  result <- rbind(result, cbind(coef=m$coefficients, p=m$prob, locus=k))
}
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef    se(coef)    lower 0.95  upper 0.95
## (Intercept)           -29.08650611 65.03194079 -156.50119345 99.45899194
## Year                    0.01367681  0.03223093   -0.05003595  0.07682232
## RegionSouthern Africa  -1.07492377  0.26894738   -1.63379372 -0.57292847
## RegionSouthern Asia    -2.82549482  0.40638039   -3.72886712 -2.10776850
## RegionWestern Africa   -1.29204824  0.49182954   -2.41248429 -0.43703467
##                            Chisq            p method
## (Intercept)            0.1983579 6.560496e-01      2
## Year                   0.1785592 6.726144e-01      2
## RegionSouthern Africa 19.5146241 9.983246e-06      2
## RegionSouthern Asia          Inf 0.000000e+00      2
## RegionWestern Africa   9.9111038 1.642846e-03      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=125.8823 on 4 df, p=0, n=1877
## Wald test = 616.8107 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia, RegionWestern Africa exceeded. Try
## to increase the number of iterations by passing 'logistpl.control(maxit=...)'
## to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           130.00000000 43.22353251 -370.0000000 630.0000000
## Year                   -0.06458702  0.02142338   -0.3125582   0.1829517
## RegionSouthern Africa  -0.01023999  0.15334746  -10.5650041   2.1760917
## RegionSouthern Asia    -0.12224666  0.11896939  -10.3843446   1.4477985
## RegionWestern Africa   -0.11487808  0.24194477  -25.0351582   4.6747241
##                           Chisq            p method
## (Intercept)            0.000000 1.000000e+00      2
## Year                   0.000000 1.000000e+00      2
## RegionSouthern Africa  2.549452 1.103325e-01      2
## RegionSouthern Asia   44.206672 2.954725e-11      2
## RegionWestern Africa   3.522880 6.052718e-02      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-345.1287 on 4 df, p=1, n=1877
## Wald test = 83.59206 on 4 df, p = 0logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef     se(coef)    lower 0.95  upper 0.95
## (Intercept)           -72.35455265 101.63799482 -272.95985795 130.0138677
## Year                    0.03435712   0.05037043   -0.06594629   0.1337621
## RegionSouthern Africa   1.61971153   0.23776879    1.15508425   2.0919412
## RegionSouthern Asia    -0.83100114   0.35498796   -1.58030718  -0.1700594
## RegionWestern Africa   -0.89610551   0.83527176   -3.08481192   0.4346505
##                           Chisq         p method
## (Intercept)           0.4952034 0.4816153      2
## Year                  0.4546540 0.5001331      2
## RegionSouthern Africa       Inf 0.0000000      2
## RegionSouthern Asia   6.2290525 0.0125671      2
## RegionWestern Africa  1.5142666 0.2184892      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=81.90503 on 4 df, p=1.110223e-16, n=1877
## Wald test = 678.5959 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)    lower 0.95 upper 0.95
## (Intercept)           -121.52670942 73.43279592 -267.12400641 22.3627828
## Year                     0.05879294  0.03639016   -0.01251884  0.1309384
## RegionSouthern Africa   -2.28668794  0.81621578   -4.47188270 -0.9692806
## RegionSouthern Asia      1.64898827  0.18342303    1.29331880  2.0146749
## RegionWestern Africa    -0.52044139  0.65649154   -2.10968342  0.5785972
##                            Chisq            p method
## (Intercept)            2.7368651 9.805696e-02      2
## Year                   2.6075768 1.063542e-01      2
## RegionSouthern Africa 15.9920760 6.360816e-05      2
## RegionSouthern Asia          Inf 0.000000e+00      2
## RegionWestern Africa   0.7277568 3.936113e-01      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=158.4881 on 4 df, p=0, n=1877
## Wald test = 632.3645 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)   lower 0.95 upper 0.95
## (Intercept)           -130.00000000 84.31953547 -630.0000000 370.000000
## Year                     0.06304681  0.04178511   -0.1851209   0.310525
## RegionSouthern Africa    0.31769241  0.27165518   -1.8369696   2.554567
## RegionSouthern Asia      0.13270029  0.22680909   -1.7342323   2.090457
## RegionWestern Africa     0.39596654  0.42209128   -4.3975868   3.245775
##                           Chisq            p method
## (Intercept)            0.000000 1.000000e+00      2
## Year                   0.000000 1.000000e+00      2
## RegionSouthern Africa 27.188599 1.845438e-07      2
## RegionSouthern Asia   14.625671 1.311163e-04      2
## RegionWestern Africa   3.373524 6.625189e-02      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-20.39887 on 4 df, p=1, n=1877
## Wald test = 833.0725 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionWestern Africa exceeded. Try to increase the number of iterations by
## passing 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)   lower 0.95 upper 0.95 Chisq p
## (Intercept)           -130.00000000 79.06178898 -630.0000000 370.000000     0 1
## Year                     0.06322928  0.03917991   -0.1848394   0.310769     0 1
## RegionSouthern Africa   -0.15289554  0.27007254   -6.5911993   2.802949     0 1
## RegionSouthern Asia     -0.50137616  0.23177200  -12.3641743   1.784740     0 1
## RegionWestern Africa    -0.46106602  0.50244411  -47.4440673   3.499484     0 1
##                       method
## (Intercept)                2
## Year                       2
## RegionSouthern Africa      2
## RegionSouthern Asia        2
## RegionWestern Africa       2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-60.15645 on 4 df, p=1, n=1877
## Wald test = 841.6061 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef     se(coef)   lower 0.95 upper 0.95
## (Intercept)           -130.00000000 114.96786210 -630.0000000 370.000000
## Year                     0.06261730   0.05697248   -0.1855926   0.310063
## RegionSouthern Africa    0.07047895   0.42790563   -4.5463479   3.314323
## RegionSouthern Asia      0.69972099   0.29094416   -0.7228661   3.838253
## RegionWestern Africa     0.31898132   0.63620254   -8.7612434   4.236022
##                           Chisq            p method
## (Intercept)            0.000000 1.000000e+00      2
## Year                   0.000000 1.000000e+00      2
## RegionSouthern Africa  2.492300 1.144044e-01      2
## RegionSouthern Asia   63.179126 1.887379e-15      2
## RegionWestern Africa   1.256183 2.623749e-01      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-2.263486 on 4 df, p=1, n=1877
## Wald test = 695.9312 on 4 df, p = 0logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef     se(coef)   lower 0.95  upper 0.95
## (Intercept)           -47.09034721 152.92946154 -353.6408686 262.8169147
## Year                    0.02133827   0.07579197   -0.1322824   0.1732344
## RegionSouthern Africa  -0.21198907   0.60557530   -1.5789092   0.8785118
## RegionSouthern Asia    -0.01005697   0.43669107   -0.9213354   0.8289919
## RegionWestern Africa    1.47348782   0.49872705    0.3935160   2.3969498
##                              Chisq           p method
## (Intercept)           0.0900670734 0.764091903      2
## Year                  0.0753019063 0.783768077      2
## RegionSouthern Africa 0.1262179775 0.722386058      2
## RegionSouthern Asia   0.0005197518 0.981811351      2
## RegionWestern Africa  6.6426376582 0.009956641      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=7.313081 on 4 df, p=0.1202397, n=1877
## Wald test = 531.5882 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia exceeded. Try to increase the number
## of iterations by passing 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           130.00000000 90.18028308 -370.0000000 630.0000000
## Year                   -0.06573042  0.04470224   -0.3137769   0.1818625
## RegionSouthern Africa  -0.57785009  0.37883740  -43.4911883   1.7012637
## RegionSouthern Asia    -0.47058847  0.26951153  -10.5563668   1.6210080
## RegionWestern Africa   -0.29862098  0.52589872  -20.4370983   3.0991820
##                          Chisq            p method
## (Intercept)            0.00000 1.000000e+00      2
## Year                   0.00000 1.000000e+00      2
## RegionSouthern Africa 49.10809 2.422396e-12      2
## RegionSouthern Asia   56.95319 4.463097e-14      2
## RegionWestern Africa   2.77248 9.589750e-02      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-22.01399 on 4 df, p=1, n=1877
## Wald test = 801.6743 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef     se(coef)   lower 0.95  upper 0.95
## (Intercept)           -79.86133195 183.12319603 -471.9157131 280.3603917
## Year                    0.03633018   0.09075138   -0.1422319   0.2305735
## RegionSouthern Africa   0.32211716   1.63513706   -4.6710715   3.2790074
## RegionSouthern Asia     3.49186090   0.85080529    2.0875325   5.7081797
## RegionWestern Africa    1.52994867   1.62153311   -3.4567945   4.4700332
##                            Chisq         p method
## (Intercept)           0.17736798 0.6736450      2
## Year                  0.14921118 0.6992903      2
## RegionSouthern Africa 0.03629103 0.8489157      2
## RegionSouthern Asia          Inf 0.0000000      2
## RegionWestern Africa  0.65602285 0.4179675      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=48.01091 on 4 df, p=9.388562e-10, n=1877
## Wald test = 294.0508 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia, RegionWestern Africa exceeded. Try
## to increase the number of iterations by passing 'logistpl.control(maxit=...)'
## to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef    se(coef)  lower 0.95  upper 0.95 Chisq p
## (Intercept)           130.00000000 77.84694236 -370.000000 630.0000000     0 1
## Year                   -0.06573852  0.03858862   -0.313879   0.1817258     0 1
## RegionSouthern Africa   0.17156952  0.29129948  -41.297493   3.4329051     0 1
## RegionSouthern Asia     0.60752198  0.20149983   -3.200364  13.4421086     0 1
## RegionWestern Africa    0.06013058  0.45675179  -42.019273  21.3314949     0 1
##                       method
## (Intercept)                2
## Year                       2
## RegionSouthern Africa      2
## RegionSouthern Asia        2
## RegionWestern Africa       2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-123.3719 on 4 df, p=1, n=1877
## Wald test = 839.4505 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef     se(coef)   lower 0.95 upper 0.95
## (Intercept)           -130.00000000 121.88183870 -630.0000000 370.000000
## Year                     0.06259038   0.06039869   -0.1855915   0.310053
## RegionSouthern Africa    0.32720775   0.40567347   -2.6546470   3.452844
## RegionSouthern Asia      0.51769598   0.31154462   -1.1543532   3.460549
## RegionWestern Africa    -0.12299438   0.78568187  -24.6103203   3.779354
##                          Chisq            p method
## (Intercept)            0.00000 1.000000e+00      2
## Year                   0.00000 1.000000e+00      2
## RegionSouthern Africa 13.49821 2.387915e-04      2
## RegionSouthern Asia   38.35544 5.896340e-10      2
## RegionWestern Africa   0.00000 1.000000e+00      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-6.806337 on 4 df, p=1, n=1877
## Wald test = 676.9765 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef     se(coef)   lower 0.95  upper 0.95 Chisq
## (Intercept)           130.00000000 150.22702926 -370.0000000 630.0000000     0
## Year                   -0.06669531   0.07446882   -0.3149471   0.1806365     0
## RegionSouthern Africa   3.31188907   0.37653879    2.6648059  13.4234313   Inf
## RegionSouthern Asia     0.55438541   0.47776614   -2.8049682  10.0039977     0
## RegionWestern Africa    0.41428263   0.95750485  -20.2063161  10.3553847     0
##                       p method
## (Intercept)           1      2
## Year                  1      2
## RegionSouthern Africa 0      2
## RegionSouthern Asia   1      2
## RegionWestern Africa  1      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=153.7819 on 4 df, p=0, n=1877
## Wald test = 474.4761 on 4 df, p = 0logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef     se(coef)   lower 0.95  upper 0.95
## (Intercept)            7.420644949 170.28166884 -331.7992812 360.5309195
## Year                  -0.005624995   0.08439765   -0.1806764   0.1624673
## RegionSouthern Africa -0.262067625   0.60790858   -1.6330655   0.8385810
## RegionSouthern Asia   -1.301046960   0.68293132   -2.9327091  -0.1105894
## RegionWestern Africa   0.551000707   0.67643559   -1.0677861   1.7171036
##                             Chisq          p method
## (Intercept)           0.001786984 0.96628128      2
## Year                  0.004181784 0.94843937      2
## RegionSouthern Africa 0.192205246 0.66108751      2
## RegionSouthern Asia   4.687770695 0.03037804      2
## RegionWestern Africa  0.571963019 0.44947992      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=6.446389 on 4 df, p=0.1681998, n=1877
## Wald test = 475.1002 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionWestern Africa exceeded. Try to increase the
## number of iterations by passing 'logistpl.control(maxit=...)' to parameter
## plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef    se(coef)   lower 0.95 upper 0.95
## (Intercept)           130.00000000 106.6712283 -370.0000000 630.000000
## Year                   -0.06610104   0.0528774   -0.3142432   0.181374
## RegionSouthern Africa  -0.03381126   0.4330638  -37.0344853   3.484746
## RegionSouthern Asia     0.65278808   0.2731829   -1.7103328   6.371464
## RegionWestern Africa   -0.01048477   0.6523523  -41.4718894   6.287870
##                            Chisq         p method
## (Intercept)           0.00000000 1.0000000      2
## Year                  0.00000000 1.0000000      2
## RegionSouthern Africa 1.86500593 0.1720482      2
## RegionSouthern Asia   0.00000000 1.0000000      2
## RegionWestern Africa  0.05313081 0.8177023      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-30.6719 on 4 df, p=1, n=1877
## Wald test = 733.871 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef     se(coef)   lower 0.95  upper 0.95
## (Intercept)           -130.00000000 104.25616519 -630.0000000 370.0000000
## Year                     0.06287257   0.05166456   -0.1852241   0.3103878
## RegionSouthern Africa   -0.29736973   0.38731630  -10.5316925   2.6850990
## RegionSouthern Asia     -0.20084701   0.29155790   -5.0752118   2.3949611
## RegionWestern Africa     0.28346335   0.51301932   -9.1103198   4.0187789
##                          Chisq         p method
## (Intercept)           0.000000 1.0000000      2
## Year                  0.000000 1.0000000      2
## RegionSouthern Africa 0.000000 1.0000000      2
## RegionSouthern Asia   0.000000 1.0000000      2
## RegionWestern Africa  1.896231 0.1685008      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-33.20581 on 4 df, p=1, n=1877
## Wald test = 754.8831 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia exceeded. Try to increase the number
## of iterations by passing 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef     se(coef)   lower 0.95  upper 0.95 Chisq
## (Intercept)           130.00000000 139.78944005 -370.0000000 630.0000000     0
## Year                   -0.06634971   0.06929458   -0.3144698   0.1811433     0
## RegionSouthern Africa   0.70937688   0.43512388   -2.0275186   3.3670256     0
## RegionSouthern Asia     0.22517648   0.38552872   -3.3829029   3.8223126     0
## RegionWestern Africa    0.83049534   0.58631166   -4.3707455   5.0709390     0
##                       p method
## (Intercept)           1      2
## Year                  1      2
## RegionSouthern Africa 1      2
## RegionSouthern Asia   1      2
## RegionWestern Africa  1      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-11.41955 on 4 df, p=1, n=1877
## Wald test = 616.4021 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia, RegionWestern Africa exceeded. Try
## to increase the number of iterations by passing 'logistpl.control(maxit=...)'
## to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           -130.00000000 87.28676591 -630.0000000 370.0000000
## Year                     0.06303238  0.04325552   -0.1851036   0.3105265
## RegionSouthern Africa    0.17034884  0.28748605   -6.9121863   6.4939853
## RegionSouthern Asia      0.02727847  0.23636753   -5.1421317   6.3164135
## RegionWestern Africa     0.11146681  0.47567582  -35.7801722   7.5356595
##                            Chisq            p method
## (Intercept)            0.0000000 1.000000e+00      2
## Year                   0.0000000 1.000000e+00      2
## RegionSouthern Africa 20.0571285 7.516284e-06      2
## RegionSouthern Asia    4.3680614 3.661863e-02      2
## RegionWestern Africa   0.9916866 3.193305e-01      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-85.99555 on 4 df, p=1, n=1877
## Wald test = 827.6309 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef     se(coef)   lower 0.95  upper 0.95
## (Intercept)           -130.00000000 116.25483104 -630.0000000 370.0000000
## Year                     0.06274156   0.05761037   -0.1853605   0.3102526
## RegionSouthern Africa   -0.37017067   0.44900063  -12.9130466   2.5215327
## RegionSouthern Asia     -0.05112558   0.31544622   -3.5529702   2.5578854
## RegionWestern Africa     0.14176805   0.61192393  -11.5104451   3.8271429
##                           Chisq         p method
## (Intercept)           0.0000000 1.0000000      2
## Year                  0.0000000 1.0000000      2
## RegionSouthern Africa 0.0000000 1.0000000      2
## RegionSouthern Asia   0.0000000 1.0000000      2
## RegionWestern Africa  0.5927603 0.4413537      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-23.81169 on 4 df, p=1, n=1877
## Wald test = 699.8399 on 4 df, p = 0logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef     se(coef)   lower 0.95   upper 0.95
## (Intercept)           -52.71973388 185.52307640 -426.8433204 333.55656500
## Year                    0.02407423   0.09194473   -0.1674098   0.20944261
## RegionSouthern Africa   0.52545022   0.50222720   -0.5268000   1.49573070
## RegionSouthern Asia    -1.13524851   0.69351045   -2.7824427   0.08920658
## RegionWestern Africa   -0.87645399   1.42719622   -5.7285319   1.14044931
##                            Chisq          p method
## (Intercept)           0.07441045 0.78502020      2
## Year                  0.06319436 0.80151656      2
## RegionSouthern Africa 1.02102891 0.31227507      2
## RegionSouthern Asia   3.25231813 0.07132252      2
## RegionWestern Africa  0.49146881 0.48327283      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=6.439186 on 4 df, p=0.1686627, n=1877
## Wald test = 448.9562 on 4 df, p = 0logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           -28.11944599 201.9950139 -440.1100499 395.8047393
## Year                    0.01178341   0.1001114   -0.1983748   0.2159155
## RegionSouthern Africa  -0.22083563   0.7190015   -1.8948708   1.0733278
## RegionSouthern Asia    -0.57090132   0.6211546   -1.9728448   0.5793660
## RegionWestern Africa   -0.66879246   1.4295508   -5.5241974   1.3677154
##                            Chisq         p method
## (Intercept)           0.01757483 0.8945334      2
## Year                  0.01256792 0.9107388      2
## RegionSouthern Africa 0.09620801 0.7564285      2
## RegionSouthern Asia   0.88351149 0.3472417      2
## RegionWestern Africa  0.26430732 0.6071763      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=1.08986 on 4 df, p=0.8958777, n=1877
## Wald test = 408.0509 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Asia exceeded. Try to increase the number of iterations by
## passing 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           -130.00000000 100.3352891 -630.0000000 370.0000000
## Year                     0.06291015   0.0497216   -0.1851715   0.3104293
## RegionSouthern Africa   -0.34182503   0.3796909  -26.0922336   3.7194771
## RegionSouthern Asia     -0.46700800   0.3043030  -46.5077127   2.8115483
## RegionWestern Africa     1.26287077   0.3538492   -2.9177119   5.9806703
##                          Chisq            p method
## (Intercept)            0.00000 1.000000e+00      2
## Year                   0.00000 1.000000e+00      2
## RegionSouthern Africa  0.00000 1.000000e+00      2
## RegionSouthern Asia    0.00000 1.000000e+00      2
## RegionWestern Africa  27.07174 1.960426e-07      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-25.15816 on 4 df, p=1, n=1877
## Wald test = 752.8781 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           128.06114845 206.4571958 -371.9388515 628.0611485
## Year                   -0.06586887   0.1023425   -0.3139448   0.1817288
## RegionSouthern Africa  -0.33622422   0.9848800   -7.4776358   2.0333023
## RegionSouthern Asia     0.84178862   0.5226748   -0.3603506   2.7330925
## RegionWestern Africa    0.18648490   1.2167938   -6.4871674   2.8714007
##                          Chisq         p method
## (Intercept)           0.000000 1.0000000      2
## Year                  0.000000 1.0000000      2
## RegionSouthern Africa 1.834065 0.1756482      2
## RegionSouthern Asia   1.518230 0.2178876      2
## RegionWestern Africa  0.000000 1.0000000      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=2.829322 on 4 df, p=0.5867819, n=1877
## Wald test = 389.2235 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                              coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           65.93766430 230.1374922 -408.5241645 559.5755926
## Year                  -0.03506353   0.1140730   -0.2798187   0.2000418
## RegionSouthern Africa -1.27443921   1.4698048   -6.1558038   0.9102556
## RegionSouthern Asia   -0.01607097   0.6588362   -1.4796440   1.2487212
## RegionWestern Africa   1.45169819   0.7213820   -0.2283746   2.7645777
##                              Chisq          p method
## (Intercept)           0.0726569591 0.78750682      2
## Year                  0.0836633400 0.77239301      2
## RegionSouthern Africa 1.0660679067 0.30183530      2
## RegionSouthern Asia   0.0005640451 0.98105231      2
## RegionWestern Africa  2.9993111133 0.08329993      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=4.934876 on 4 df, p=0.2940451, n=1877
## Wald test = 334.2816 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia, RegionWestern Africa exceeded. Try
## to increase the number of iterations by passing 'logistpl.control(maxit=...)'
## to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           -130.00000000 65.85606555 -630.0000000 370.0000000
## Year                     0.06329311  0.03263579   -0.1848979   0.3107595
## RegionSouthern Africa   -0.11499476  0.24976947  -45.0459598 161.1690173
## RegionSouthern Asia      0.60882347  0.16787832   -1.3164568 201.2516544
## RegionWestern Africa     0.03014043  0.39273683  -45.2285140  71.1616140
##                           Chisq         p method
## (Intercept)           0.0000000 1.0000000      2
## Year                  0.0000000 1.0000000      2
## RegionSouthern Africa 0.0000000 1.0000000      2
## RegionSouthern Asia         Inf 0.0000000      2
## RegionWestern Africa  0.4488681 0.5028729      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-142.6155 on 4 df, p=1, n=1877
## Wald test = 804.9451 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionWestern Africa exceeded. Try to increase the
## number of iterations by passing 'logistpl.control(maxit=...)' to parameter
## plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef     se(coef)   lower 0.95  upper 0.95 Chisq
## (Intercept)           130.00000000 117.56608006 -370.0000000 630.0000000     0
## Year                   -0.06620446   0.05827815   -0.3143498   0.1812633     0
## RegionSouthern Africa   0.28571022   0.42925118   -9.0596109   3.6642885     0
## RegionSouthern Asia     0.60788589   0.30353247   -1.8463481   6.4815622     0
## RegionWestern Africa    0.04502722   0.70295288  -39.0777259   6.4664769     0
##                       p method
## (Intercept)           1      2
## Year                  1      2
## RegionSouthern Africa 1      2
## RegionSouthern Asia   1      2
## RegionWestern Africa  1      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-27.49681 on 4 df, p=1, n=1877
## Wald test = 693.7843 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)            3.676462016 253.7950346 -496.3235380 503.6764620
## Year                  -0.005606705   0.1257882   -0.2535066   0.2421305
## RegionSouthern Africa  4.476694830   1.4479516    2.3161664   9.3602639
## RegionSouthern Asia    3.538018747   1.4480523    1.3530294   8.4215874
## RegionWestern Africa   2.642478905   1.9654650   -2.5785030   7.8634935
##                              Chisq            p method
## (Intercept)            0.000174254 9.894678e-01      2
## Year                   0.001646778 9.676303e-01      2
## RegionSouthern Africa 52.550995413 4.192202e-13      2
## RegionSouthern Asia   23.039663156 1.586934e-06      2
## RegionWestern Africa   1.390640286 2.382970e-01      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=30.8761 on 4 df, p=3.244823e-06, n=1877
## Wald test = 266.1445 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia exceeded. Try to increase the number
## of iterations by passing 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef     se(coef)   lower 0.95  upper 0.95
## (Intercept)           130.00000000 106.18825133 -370.0000000 630.0000000
## Year                   -0.06586743   0.05263766   -0.3138909   0.1817642
## RegionSouthern Africa  -0.66400028   0.44708640  -12.8294993   1.5521432
## RegionSouthern Asia    -0.91382504   0.35672565  -35.1888289   0.5321249
## RegionWestern Africa   -0.35259113   0.61077977  -11.8880848   2.4237312
##                           Chisq            p method
## (Intercept)            0.000000 1.000000e+00      2
## Year                   0.000000 1.000000e+00      2
## RegionSouthern Africa 38.326438 5.984633e-10      2
## RegionSouthern Asia   68.763252 1.110223e-16      2
## RegionWestern Africa   2.258624 1.328721e-01      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=6.049927 on 4 df, p=0.1954506, n=1877
## Wald test = 723.8049 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           126.55629754 226.4146315 -373.4437025 626.5562975
## Year                   -0.06591441   0.1122354   -0.3143073   0.1814108
## RegionSouthern Africa   1.65124887   1.0630958   -0.6986776   9.0208950
## RegionSouthern Asia     2.79917078   0.7923188    1.6598413  10.1298860
## RegionWestern Africa    1.57874856   1.4763027   -3.7763064   9.1347926
##                            Chisq            p method
## (Intercept)            0.0000000 1.000000e+00      2
## Year                   0.0000000 1.000000e+00      2
## RegionSouthern Africa  2.1108474 1.462583e-01      2
## RegionSouthern Asia   26.4430527 2.714229e-07      2
## RegionWestern Africa   0.7949796 3.725986e-01      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=20.31583 on 4 df, p=0.0004325695, n=1877
## Wald test = 293.245 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia, RegionWestern Africa exceeded. Try
## to increase the number of iterations by passing 'logistpl.control(maxit=...)'
## to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)  lower 0.95  upper 0.95 Chisq p
## (Intercept)           -130.00000000 50.16401953 -630.000000 370.0000000     0 1
## Year                     0.06384303  0.02486061   -0.184252   0.3114046     0 1
## RegionSouthern Africa   -0.31535749  0.18131644  -48.550315   5.4002944     0 1
## RegionSouthern Asia     -0.26444716  0.13929372  -30.498215   5.4145905     0 1
## RegionWestern Africa    -0.19376061  0.28818003  -49.093470   5.2043916     0 1
##                       method
## (Intercept)                2
## Year                       2
## RegionSouthern Africa      2
## RegionSouthern Asia        2
## RegionWestern Africa       2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-514.6524 on 4 df, p=1, n=1877
## Wald test = 519.0254 on 4 df, p = 0
logistic_sig <- as_tibble(result, rownames="variable") %>% filter(p<0.05) %>% left_join(kbayes$locus_rank)
## Joining with `by = join_by(locus)`
p<-0.02

region_2pc <- kbayes$region_est %>% select(locus, subgroup, mean) %>% 
  pivot_wider(id_cols=locus, names_from=subgroup, values_from = mean) %>% 
  mutate(d1=abs(`Eastern Africa`-`Southern Africa`),
        d2=abs(`Eastern Africa`-`Western Africa`),
        d3=abs(`Eastern Africa`-`Southern Asia`),
        d4=abs(`Southern Africa`-`Southern Asia`),
        d5=abs(`Southern Africa`-`Western Africa`),
        d6=abs(`Western Africa`-`Southern Asia`)) %>% 
  filter(d1>p | d2>p | d3>p | d4>p | d5>p | d6>p)

Fig S3 - overlapping regional densities - log2 scale

kbayes$region_post %>% 
    left_join(kbayes$locus_rank) %>%
    filter(rank<=30) %>%
    mutate(Region=fct_relevel(subgroup,c("Western Africa", "Southern Africa", "Eastern Africa", "Southern Asia"))) %>%
    ggplot() +
    geom_hline(yintercept=11) +
    geom_hline(yintercept=21) +
    geom_density_ridges(aes(x=log2(prob), y=locus, fill=Region), 
                        scale=1, rel_min_height = 0.01, alpha=0.5, col=NA) + 
    scale_y_discrete(limits=rev(kbayes$locus_rank$locus[1:30])) + 
  scale_x_continuous(limits=c(-13,0)) +
  labs(x="log2(prevalence) per region", y="", fill="Region") + 
  annotate(y=region_2pc$locus, label="*", x=-1, geom="text") + 
  annotate(y=logistic_sig$locus, label="^", x=-0.5, geom="text") +
  scale_fill_manual(values=region_cols) +
  theme_bw()
## Joining with `by = join_by(locus)`
## Picking joint bandwidth of 0.123
## Warning: Removed 2676 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_text()`).

ggsave("figures/FigS3_RegionalKBayesDist.pdf", width=6, height=9)
## Picking joint bandwidth of 0.123
## Warning: Removed 2676 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_text()`).
ggsave("figures/FigS3_RegionalKBayesDist.png", width=6, height=9)
## Picking joint bandwidth of 0.123
## Warning: Removed 2676 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_text()`).

O locus analysis

read posterior estimates for adjusted counts (main) and raw counts (for comparison)

# read adjusted Bayesian estimates for O
obayes <- parseModelledEstimates(global_post ="outputs_core/O_Full_min10_adj_28_posterior_global.csv.gz", region_post = "outputs_core/O_Full_min10_adj_28_posterior_subgroup.csv.gz", fixOnames = T)
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
# read raw Bayesian estimates for O
obayes_raw <- parseModelledEstimates(global_post ="outputs_core/O_Full_min10_raw_28_posterior_global.csv.gz", region_post = "outputs_core/O_Full_min10_raw_28_posterior_subgroup.csv.gz", fixOnames = T)
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.

global and regional prevalence

o_prev_global_dist <- locus_ridgesplot(posterior=obayes$global_post, 
                                   ranks=obayes$locus_rank, 
                                   lines_every=5, maxRank=15, 
                                   xlim=c(0,30), xbreaks=seq(0,30,5), title="a) Global estimates")
## Joining with `by = join_by(locus)`
## Picking joint bandwidth of 0.0913
## Warning: Removed 118 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

o_prev_region_heatmap <- region_heatmap(estimates=obayes$region_est, global=obayes$global_est,
                                   ranks=obayes$locus_rank, maxRank=15, median=F, title="b) Point estimates")

# fig 4a
o_prev_global_dist + o_prev_region_heatmap
## Picking joint bandwidth of 0.0913
## Warning: Removed 118 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

compare global prevalence distributions using cluster-adjusted and raw counts

# plot raw vs adjusted distributions, overlaid - for appendix s2.4b
o_dist_raw_adj <- comparative_locus_ridgesplot(posterior1=obayes$global_post,
                             posterior2=obayes_raw$global_post,
                             ranks=obayes$locus_rank, maxRank=10,
                             lines_every=5, xlim=c(0,30), xbreaks=seq(0,30,5))
## Joining with `by = join_by(locus)`
## Picking joint bandwidth of 0.104
## Warning: Removed 118 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

logistic regression of O vs year + region

oresult <- c()
for (o in obayes$locus_rank$locus[1:10]) {
  d <- data_NNS_sites10 %>%
    mutate(o=if_else(O_genotype==o, 1, 0)) %>%
    select(o, Year, Region)
  m <- summary(logistf(o~Year+Region, data=d))
  oresult <- rbind(oresult, cbind(coef=m$coefficients, p=m$prob, locus=o))
}
## logistf(formula = o ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                              coef    se(coef)   lower 0.95   upper 0.95
## (Intercept)           55.72275487 48.65687639 -39.59451130 151.36548583
## Year                  -0.02808481  0.02411652  -0.07549096   0.01915739
## RegionSouthern Africa -0.26589918  0.17803759  -0.62185989   0.07728888
## RegionSouthern Asia   -0.43447514  0.14023734  -0.71286922  -0.16247693
## RegionWestern Africa   0.30925325  0.24948359  -0.19237050   0.78975161
##                          Chisq           p method
## (Intercept)           1.311849 0.252060164      2
## Year                  1.356577 0.244131576      2
## RegionSouthern Africa 2.289580 0.130245138      2
## RegionSouthern Asia   9.915732 0.001638719      2
## RegionWestern Africa  1.491167 0.222035428      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=19.76395 on 4 df, p=0.0005559294, n=1877
## Wald test = 420.5418 on 4 df, p = 0
## Warning in logistf(o ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(o ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = o ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           -130.00000000 44.74228687 -630.0000000 370.0000000
## Year                     0.06411214  0.02217449   -0.1840187   0.3117042
## RegionSouthern Africa   -0.09329761  0.15613474   -4.1012937   2.4814756
## RegionSouthern Asia     -0.29350871  0.12410411   -4.4543442   1.5381654
## RegionWestern Africa     0.04044815  0.24579937   -8.5537389   4.3857690
##                           Chisq         p method
## (Intercept)           0.0000000 1.0000000      2
## Year                  0.0000000 1.0000000      2
## RegionSouthern Africa 0.0000000 1.0000000      2
## RegionSouthern Asia   0.0000000 1.0000000      2
## RegionWestern Africa  0.8393326 0.3595877      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-190.2889 on 4 df, p=1, n=1877
## Wald test = 202.5556 on 4 df, p = 0
## Warning in logistf(o ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(o ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia, RegionWestern Africa exceeded. Try
## to increase the number of iterations by passing 'logistpl.control(maxit=...)'
## to parameter plcontrol
## logistf(formula = o ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef   se(coef)   lower 0.95  upper 0.95    Chisq
## (Intercept)           130.00000000 42.9719403 -370.0000000 630.0000000  0.00000
## Year                   -0.06453396  0.0212985   -0.3124635   0.1830676  0.00000
## RegionSouthern Africa  -0.08209284  0.1526063   -6.2858468   1.9348022 15.91919
## RegionSouthern Asia    -0.20637185  0.1183976   -6.4973577   1.1423861 60.01016
## RegionWestern Africa   -0.20768016  0.2414996  -17.8701351   4.1530075  6.88447
##                                  p method
## (Intercept)           1.000000e+00      2
## Year                  1.000000e+00      2
## RegionSouthern Africa 6.610495e-05      2
## RegionSouthern Asia   9.436896e-15      2
## RegionWestern Africa  8.694785e-03      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-137.7519 on 4 df, p=1, n=1877
## Wald test = 65.01845 on 4 df, p = 2.550182e-13logistf(formula = o ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef    se(coef)    lower 0.95  upper 0.95
## (Intercept)           -48.26043537 67.94536038 -182.34702833 85.14272175
## Year                    0.02251919  0.03367381   -0.04360084  0.08896741
## RegionSouthern Africa   0.96357207  0.23851992    0.48820153  1.42660033
## RegionSouthern Asia     1.83299227  0.17662010    1.49100642  2.18517899
## RegionWestern Africa   -0.57799418  0.65583704   -2.16592253  0.51819343
##                            Chisq            p method
## (Intercept)            0.5019927 0.4786258406      2
## Year                   0.4449356 0.5047498289      2
## RegionSouthern Africa 15.1209247 0.0001008394      2
## RegionSouthern Asia          Inf 0.0000000000      2
## RegionWestern Africa   0.9168523 0.3383028256      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=147.9369 on 4 df, p=0, n=1877
## Wald test = 666.9666 on 4 df, p = 0
## Warning in logistf(o ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(o ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Asia, RegionWestern Africa exceeded. Try to increase the number
## of iterations by passing 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = o ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           -130.00000000 54.58600970 -630.0000000 370.0000000
## Year                     0.06365555  0.02705153   -0.1844764   0.3111785
## RegionSouthern Africa   -0.19235119  0.19694217  -18.8674877   4.5468653
## RegionSouthern Asia     -0.02903790  0.14843972   -4.6562358   4.4451879
## RegionWestern Africa     0.14718588  0.29376962  -22.7968693   5.4840734
##                          Chisq          p method
## (Intercept)           0.000000 1.00000000      2
## Year                  0.000000 1.00000000      2
## RegionSouthern Africa 0.000000 1.00000000      2
## RegionSouthern Asia   0.000000 1.00000000      2
## RegionWestern Africa  3.733312 0.05333754      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-248.1846 on 4 df, p=1, n=1877
## Wald test = 661.2743 on 4 df, p = 0logistf(formula = o ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                              coef     se(coef)    lower 0.95   upper 0.95
## (Intercept)           74.29619218 125.88249011 -177.30329704 325.51725457
## Year                  -0.03872959   0.06239698   -0.16327291   0.08596303
## RegionSouthern Africa -0.02298928   0.53886400   -1.21235318   0.95619149
## RegionSouthern Asia    0.85552298   0.32531599    0.20865445   1.50033233
## RegionWestern Africa   1.08745681   0.52935370   -0.08820886   2.04304632
##                             Chisq           p method
## (Intercept)           0.336144039 0.562063457      2
## Year                  0.371791359 0.542029440      2
## RegionSouthern Africa 0.001815434 0.966014093      2
## RegionSouthern Asia   6.661335030 0.009852707      2
## RegionWestern Africa  3.348037206 0.067285199      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=9.603924 on 4 df, p=0.04765508, n=1877
## Wald test = 623.2345 on 4 df, p = 0
## Warning in logistf(o ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## logistf(formula = o ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                              coef     se(coef)   lower 0.95   upper 0.95
## (Intercept)           84.79724993 129.76090576 -173.6805811 345.20609397
## Year                  -0.04395748   0.06432027   -0.1730579   0.08414509
## RegionSouthern Africa  2.36814626   0.31453954    1.7624901   3.00841585
## RegionSouthern Asia    0.35014324   0.37621814   -0.4183043   1.08016764
## RegionWestern Africa  -1.11436778   1.42969126   -5.9629769   0.88233808
##                           Chisq         p method
## (Intercept)           0.4122658 0.5208217      2
## Year                  0.4509773 0.5018712      2
## RegionSouthern Africa       Inf 0.0000000      2
## RegionSouthern Asia   0.8300018 0.3622720      2
## RegionWestern Africa  0.8737337 0.3499237      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=76.57055 on 4 df, p=8.881784e-16, n=1877
## Wald test = 580.9497 on 4 df, p = 0
## Warning in logistf(o ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(o ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia, RegionWestern Africa exceeded. Try
## to increase the number of iterations by passing 'logistpl.control(maxit=...)'
## to parameter plcontrol
## logistf(formula = o ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           -105.1643788 612.1606066 -605.1643788 394.8356212
## Year                     0.0484608   0.3033649   -0.1994038   0.2961592
## RegionSouthern Africa    1.2066802   1.9147514   -4.2356803   4.9726518
## RegionSouthern Asia      0.6785635   1.8239416   -4.7948089   5.4484336
## RegionWestern Africa     2.5649350   1.7590046   -2.6717788   7.8519283
##                             Chisq         p method
## (Intercept)           0.004445785 0.9468391      2
## Year                  0.003942428 0.9499347      2
## RegionSouthern Africa 0.652709494 0.4191456      2
## RegionSouthern Asia   0.200708690 0.6541494      2
## RegionWestern Africa  1.327986488 0.2491637      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=1.502643 on 4 df, p=0.8261731, n=1877
## Wald test = 99.40515 on 4 df, p = 0
## Warning in logistf(o ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(o ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia exceeded. Try to increase the number
## of iterations by passing 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = o ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           -130.00000000 81.57949839 -630.0000000 370.0000000
## Year                     0.06304831  0.04042727   -0.1851406   0.3105105
## RegionSouthern Africa    0.35632392  0.26744907   -5.3555231  10.7892712
## RegionSouthern Asia      0.35756699  0.21426960   -2.3085569  11.5892249
## RegionWestern Africa     0.47654275  0.40913946  -16.8466748  13.1276275
##                           Chisq            p method
## (Intercept)            0.000000 1.000000e+00      2
## Year                   0.000000 1.000000e+00      2
## RegionSouthern Africa 48.124693 3.999578e-12      2
## RegionSouthern Asia         Inf 0.000000e+00      2
## RegionWestern Africa   6.760224 9.321205e-03      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-76.63685 on 4 df, p=1, n=1877
## Wald test = 834.1605 on 4 df, p = 0
## Warning in logistf(o ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(o ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Asia exceeded. Try to increase the number of iterations by
## passing 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = o ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           -130.00000000 245.9853743 -630.0000000 370.0000000
## Year                     0.06189375   0.1218978   -0.1862013   0.3094002
## RegionSouthern Africa   -0.04477779   0.9096706  -12.9112451   5.7649779
## RegionSouthern Asia      0.13053122   0.6785308   -4.5506945   5.7846867
## RegionWestern Africa     1.36140326   0.8426392   -2.7214082   7.8420568
##                          Chisq          p method
## (Intercept)           0.000000 1.00000000      2
## Year                  0.000000 1.00000000      2
## RegionSouthern Africa 0.000000 1.00000000      2
## RegionSouthern Asia   2.144623 0.14307015      2
## RegionWestern Africa  4.730389 0.02963417      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-2.691265 on 4 df, p=1, n=1877
## Wald test = 329.4833 on 4 df, p = 0
logistic_sig_o <- as_tibble(oresult, rownames="variable") %>% filter(p<0.05) %>% left_join(obayes$locus_rank)
## Joining with `by = join_by(locus)`
p<-0.05

region_5pc_o <- obayes$region_est %>% select(locus, subgroup, mean) %>% 
  pivot_wider(id_cols=locus, names_from=subgroup, values_from = mean) %>% 
  mutate(d1=abs(`Eastern Africa`-`Southern Africa`),
        d2=abs(`Eastern Africa`-`Western Africa`),
        d3=abs(`Eastern Africa`-`Southern Asia`),
        d4=abs(`Southern Africa`-`Southern Asia`),
        d5=abs(`Southern Africa`-`Western Africa`),
        d6=abs(`Western Africa`-`Southern Asia`)) %>% 
  filter(d1>p | d2>p | d3>p | d4>p | d5>p | d6>p)

Fig S5 - overlapping regional densities for O types - log2 scale

obayes$region_post %>% 
    left_join(obayes$locus_rank) %>%
    filter(rank<=10) %>%
    mutate(Region=fct_relevel(subgroup,c("Western Africa", "Southern Africa", "Eastern Africa", "Southern Asia"))) %>%
    ggplot() +
    geom_hline(yintercept=11) +
    geom_hline(yintercept=21) +
    geom_density_ridges(aes(x=log2(prob), y=locus, fill=Region), 
                        scale=1, rel_min_height = 0.01, alpha=0.5, col=NA) + 
    scale_y_discrete(limits=rev(obayes$locus_rank$locus[1:10])) + 
  scale_x_continuous(limits=c(-13,0)) +
  labs(x="log2(prevalence) per region", y="", fill="Region") + 
  annotate(y=region_5pc_o$locus, label="*", x=-1, geom="text") + 
  annotate(y=logistic_sig_o$locus, label="^", x=-0.5, geom="text") +
  scale_fill_manual(values=region_cols) +
  theme_bw()
## Joining with `by = join_by(locus)`
## Picking joint bandwidth of 0.0644
## Warning: Removed 54 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

ggsave("figures/FigS5_RegionalOBayesDist.pdf", width=6, height=5)
## Picking joint bandwidth of 0.0644
## Warning: Removed 54 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
ggsave("figures/FigS5_RegionalOBayesDist.png", width=6, height=5)
## Picking joint bandwidth of 0.0644
## Warning: Removed 54 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

Appendix Fig S1.3 = crude annual K rate per region

K_crude_perRegion_perYear <- data_NNS_sites10 %>%
  raw_adj_prop(grouping_vars = c("K_locus", "Region", "Year"), summarise_by = "Region", adj_vars = c("Cluster", "Region", "Year")) 
## grouping_var: Region in adj_vars, removing from adj_vars
## grouping_var: Year in adj_vars, removing from adj_vars
## Grouping vars: K_locus, Region, Year
## Summarising by: Region
## Adj vars: Cluster
K_crude_perRegion_perYear <- K_crude_perRegion_perYear %>% group_by(Region, Year) %>% 
  summarise(n=sum(adj_count)) %>% left_join(K_crude_perRegion_perYear) %>%
  mutate(adj_prop = adj_count/n) %>% select(-raw_sum, -adj_sum, -raw_prop, -raw_count)
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
## Joining with `by = join_by(Region, Year)`
K_crude_perRegion_perYear %>% 
    left_join(kbayes$locus_rank %>% rename(K_locus=locus)) %>% 
    filter(rank <=30 | K_locus %in% c("KL116", "KL53", "KL81")) %>%
    ggplot(aes(x=Year, fill=Region, y=adj_prop)) + 
    geom_col() +
    theme_bw() + 
    theme(axis.text.x = element_text(angle=45, hjust=1, size=8), 
          axis.text.y=element_text(size=10)) + 
  facet_wrap(~rank+K_locus, scales="free_y") +
    theme(legend.position="bottom") +
  scale_fill_manual(values=region_cols)
## Joining with `by = join_by(K_locus)`

ggsave("figures/AppendixFigS1.3_crudeRegionAnnualRateK.pdf", width=7, height=9)
ggsave("figures/AppendixFigS1.3_crudeRegionAnnualRateK.png", width=7, height=9)

Appendix Fig S1.4 - simple est per year per O/region

O_crude_perRegion_perYear <- data_NNS_sites10 %>%
  raw_adj_prop(grouping_vars = c("O_genotype", "Region", "Year"), summarise_by = "Region", adj_vars = c("Cluster", "Region", "Year")) 
## grouping_var: Region in adj_vars, removing from adj_vars
## grouping_var: Year in adj_vars, removing from adj_vars
## Grouping vars: O_genotype, Region, Year
## Summarising by: Region
## Adj vars: Cluster
O_crude_perRegion_perYear <- O_crude_perRegion_perYear %>% group_by(Region, Year) %>% 
  summarise(n=sum(adj_count)) %>% left_join(O_crude_perRegion_perYear) %>%
  mutate(adj_prop = adj_count/n) %>% select(-raw_sum, -adj_sum, -raw_prop, -raw_count)
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
## Joining with `by = join_by(Region, Year)`
O_crude_perRegion_perYear %>% 
    left_join(obayes$locus_rank %>% rename(O_genotype=locus)) %>% 
    #filter(rank <=12) %>%
    ggplot(aes(x=Year, fill=Region, y=adj_prop)) + 
    geom_col() +
    theme_bw() + 
    theme(axis.text.x = element_text(angle=45, hjust=1, size=10), 
          axis.text.y=element_text(size=10)) + 
  facet_wrap(~rank+O_genotype, scales="free_y", ncol=3) +
    theme(legend.position="bottom") + 
  scale_fill_manual(values=region_cols) + labs(fill="")
## Joining with `by = join_by(O_genotype)`

ggsave("figures/AppendixFigS1.4_crudeRegionAnnualRateO.pdf", width=6, height=8)
ggsave("figures/AppendixFigS1.4_crudeRegionAnnualRateO.png", width=6, height=8)

Appendix Figure S1.5 - ST vs KL coloured by region, for top 4 O types

o_region_ST <- data_NNS_sites10 %>%
  filter(O_genotype %in% c("O2afg", "O4", "O2a", "O13")) %>%
  mutate(Region=fct_relevel(Region, c("Western Africa", "Southern Africa", "Eastern Africa", "Southern Asia"))) %>%
  ggplot(aes(col=Region, x=K_locus, y=ST)) + 
  geom_jitter(alpha=0.5) +
  theme_bw() + 
  theme(axis.text.x = element_text(angle=45, hjust=1, size=10), 
        axis.text.y=element_text(size=9), legend.position="bottom") +
  scale_colour_manual(values=region_cols) + 
  facet_wrap(~O_genotype, scales="free") + labs(col="", y="", x="")
o_region_ST

ggsave("figures/AppendixFigS1.5_O_STvsKbyRegion.pdf", width=7, height=9)
ggsave("figures/AppendixFigS1.5_O_STvsKbyRegion.png", width=7, height=9)

effect of clustering - modelled estimates

kbayes_global <- full_join(kbayes$global_est, kbayes_raw$global_est, by="locus", suffix=c(".adj", ".raw")) 

bayes_global_K_raw_vs_adj <- kbayes_global %>%
  ggplot(aes(x=mean.raw*100, y=mean.adj*100)) + 
  geom_point() + theme_bw() + 
  labs(x="Bayesian estimate (%), raw", y="Bayesian estimate (%), cluster-adjusted") +
  geom_abline(slope=1, intercept=0) + 
  geom_text(data=kbayes_global[kbayes_global$locus %in% kbayes$locus_rank$locus[1:5],], aes(x=mean.raw*100, y=mean.adj*100, label=locus), hjust=1, nudge_x=-0.2, size=2.5)
obayes_global <- full_join(obayes$global_est, obayes_raw$global_est, by="locus", suffix=c(".adj", ".raw")) 

bayes_global_O_raw_vs_adj <- obayes_global %>%
  ggplot(aes(x=mean.raw*100, y=mean.adj*100)) + 
  geom_point() + theme_bw() + 
  labs(x="Bayesian estimate (%), raw", y="Bayesian estimate (%), cluster-adjusted") +
  geom_abline(slope=1, intercept=0) + 
  geom_text(data=obayes_global[obayes_global$locus %in% obayes$locus_rank$locus[1:9],], aes(x=mean.raw*100, y=mean.adj*100, label=locus), hjust=1, nudge_x=-0.2, size=2.5)

Appendix Figure S2.4 - cluster vs raw, global K

( (bayes_global_K_raw_vs_adj + ggtitle("a) K locus (mean estimate)") ) + (bayes_global_O_raw_vs_adj  + ggtitle("b) O type (mean estimate)")) + patchwork::plot_layout(axes="collect") ) / (k_dist_raw_adj + ggtitle ("c) K locus (posterior distribution)") + o_dist_raw_adj + ggtitle ("d) O type (posterior distribution)") + patchwork::plot_layout(guides="collect")) + patchwork::plot_layout(heights=c(1,2)) & theme(legend.position='bottom')
## Picking joint bandwidth of 0.0595
## Warning: Removed 162 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Picking joint bandwidth of 0.104
## Warning: Removed 118 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

ggsave("figures/AppendixFigS2.4_clusterVsRaw.pdf", width=8, height=11)
## Picking joint bandwidth of 0.0595
## Warning: Removed 162 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Picking joint bandwidth of 0.104
## Warning: Removed 118 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
ggsave("figures/AppendixFigS2.4_clusterVsRaw.png", width=8, height=11)
## Picking joint bandwidth of 0.0595
## Warning: Removed 162 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Picking joint bandwidth of 0.104
## Warning: Removed 118 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

Effect of cluster adjustment on crude proportion per site

K_crude_perSite <- data_NNS_sites10 %>%
  raw_adj_prop(grouping_vars = c("K_locus", "Site"), summarise_by = "Site", adj_vars = c("Cluster", "Site")) %>%
  mutate(raw_prop = raw_count/raw_sum) %>%
  mutate(adj_prop = adj_count/adj_sum)
## grouping_var: Site in adj_vars, removing from adj_vars
## Grouping vars: K_locus, Site
## Summarising by: Site
## Adj vars: Cluster
O_crude_perSite <- data_NNS_sites10 %>%
  raw_adj_prop(grouping_vars = c("O_genotype", "Site"), summarise_by = "Site", adj_vars = c("Cluster", "Site")) %>%
  mutate(raw_prop = raw_count/raw_sum) %>%
  mutate(adj_prop = adj_count/adj_sum)
## grouping_var: Site in adj_vars, removing from adj_vars
## Grouping vars: O_genotype, Site
## Summarising by: Site
## Adj vars: Cluster
# KL25
K_crude_raw_adj_scatter_KL25 <- K_crude_perSite %>% 
  filter(K_locus=="KL25") %>%
  ggplot(aes(x=raw_prop, y=adj_prop)) +
  geom_point() + 
  geom_abline(slope=1, intercept=0) +
  theme_bw() +
  labs(x="Crude proportion, raw", y="Crude proportion, adjusted")

# KL2
K_crude_raw_adj_scatter_KL2 <- K_crude_perSite %>% 
  filter(K_locus=="KL2") %>%
  ggplot(aes(x=raw_prop, y=adj_prop)) +
  geom_point() + 
  geom_abline(slope=1, intercept=0) +
  theme_bw() +
  labs(x="Crude proportion, raw", y="Crude proportion, adjusted")

# KL102
K_crude_raw_adj_scatter_KL102 <- K_crude_perSite %>% 
  filter(K_locus=="KL102") %>%
  ggplot(aes(x=raw_prop, y=adj_prop)) +
  geom_point() + 
  geom_abline(slope=1, intercept=0) +
  theme_bw() +
  labs(x="Crude proportion, raw", y="Crude proportion, adjusted")

# KL62
K_crude_raw_adj_scatter_KL62 <- K_crude_perSite %>% 
  filter(K_locus=="KL62") %>%
  ggplot(aes(x=raw_prop, y=adj_prop)) +
  geom_point() + 
  geom_abline(slope=1, intercept=0) +
  theme_bw() +
  labs(x="Crude proportion, raw", y="Crude proportion, adjusted")

# O1v1
O_crude_raw_adj_scatter_O1v1 <- O_crude_perSite %>% 
  filter(O_genotype=="O1.v1") %>%
  ggplot(aes(x=raw_prop, y=adj_prop)) +
  geom_point() + 
  geom_abline(slope=1, intercept=0) +
  theme_bw() +
  labs(x="Crude proportion, raw", y="Crude proportion, adjusted")

# O1v2
O_crude_raw_adj_scatter_O1v2 <- O_crude_perSite %>% 
  filter(O_genotype=="O1.v2") %>%
  ggplot(aes(x=raw_prop, y=adj_prop)) +
  geom_point() + 
  geom_abline(slope=1, intercept=0) +
  theme_bw() +
  labs(x="Crude proportion, raw", y="Crude proportion, adjusted")

# O2afg
O_crude_raw_adj_scatter_O2afg <- O_crude_perSite %>% 
  filter(O_genotype=="O2afg") %>%
  ggplot(aes(x=raw_prop, y=adj_prop)) +
  geom_point() + 
  geom_abline(slope=1, intercept=0) +
  theme_bw() +
  labs(x="Crude proportion, raw", y="Crude proportion, adjusted")

# O4
O_crude_raw_adj_scatter_O4 <- O_crude_perSite %>% 
  filter(O_genotype=="O4") %>%
  ggplot(aes(x=raw_prop, y=adj_prop)) +
  geom_point() + 
  geom_abline(slope=1, intercept=0) +
  theme_bw() +
  labs(x="Crude proportion, raw", y="Crude proportion, adjusted")

Appendix Fig S2.3 - Effect of cluster adjustment on crude proportion

(K_crude_raw_adj_scatter_KL102 + ggtitle("a) KL102, per-site") + K_crude_raw_adj_scatter_KL2 + ggtitle("b) KL2, per-site")) / (K_crude_raw_adj_scatter_KL25 + ggtitle("c) KL25, per-site") + K_crude_raw_adj_scatter_KL62 + ggtitle("d) KL62, per-site")) / (O_crude_raw_adj_scatter_O1v1 + ggtitle("e) O1v1, per-site") + O_crude_raw_adj_scatter_O1v2 + ggtitle("f) O1v2, per-site")) / (O_crude_raw_adj_scatter_O2afg + ggtitle("g) O2afg, per-site") + O_crude_raw_adj_scatter_O4 + ggtitle("h) O4, per-site"))

ggsave("figures/AppendixFigS2.3_EffectClusteringCrude.pdf", width=8, height=10)
ggsave("figures/AppendixFigS2.3_EffectClusteringCrude.png", width=8, height=10)

numbers for text - K prevalence estimates

kbayes$global_est
## # A tibble: 90 × 6
##    locus median   mean  lower  upper  rank
##    <chr>  <dbl>  <dbl>  <dbl>  <dbl> <int>
##  1 KL2   0.0894 0.0897 0.0711 0.110      1
##  2 KL102 0.0869 0.0872 0.0683 0.107      2
##  3 KL25  0.0855 0.0858 0.0678 0.106      3
##  4 KL15  0.0539 0.0543 0.0396 0.0708     4
##  5 KL62  0.0464 0.0468 0.0332 0.0624     5
##  6 KL30  0.0325 0.0328 0.0218 0.0459     6
##  7 KL10  0.0261 0.0265 0.0166 0.0388     7
##  8 KL17  0.0248 0.0252 0.0155 0.0370     8
##  9 KL23  0.0248 0.0252 0.0157 0.0369     9
## 10 KL51  0.0237 0.0240 0.0147 0.0355    10
## # ℹ 80 more rows
kbayes$global_est %>% filter(rank<=5)
## # A tibble: 5 × 6
##   locus median   mean  lower  upper  rank
##   <chr>  <dbl>  <dbl>  <dbl>  <dbl> <int>
## 1 KL2   0.0894 0.0897 0.0711 0.110      1
## 2 KL102 0.0869 0.0872 0.0683 0.107      2
## 3 KL25  0.0855 0.0858 0.0678 0.106      3
## 4 KL15  0.0539 0.0543 0.0396 0.0708     4
## 5 KL62  0.0464 0.0468 0.0332 0.0624     5
kbayes$global_est %>% filter(median>0.02)
## # A tibble: 13 × 6
##    locus median   mean  lower  upper  rank
##    <chr>  <dbl>  <dbl>  <dbl>  <dbl> <int>
##  1 KL2   0.0894 0.0897 0.0711 0.110      1
##  2 KL102 0.0869 0.0872 0.0683 0.107      2
##  3 KL25  0.0855 0.0858 0.0678 0.106      3
##  4 KL15  0.0539 0.0543 0.0396 0.0708     4
##  5 KL62  0.0464 0.0468 0.0332 0.0624     5
##  6 KL30  0.0325 0.0328 0.0218 0.0459     6
##  7 KL10  0.0261 0.0265 0.0166 0.0388     7
##  8 KL17  0.0248 0.0252 0.0155 0.0370     8
##  9 KL23  0.0248 0.0252 0.0157 0.0369     9
## 10 KL51  0.0237 0.0240 0.0147 0.0355    10
## 11 KL112 0.0236 0.0240 0.0147 0.0356    11
## 12 KL24  0.0223 0.0227 0.0135 0.0341    12
## 13 KL149 0.0212 0.0215 0.0128 0.0326    13
kbayes$global_est %>% filter(median>0.01)
## # A tibble: 27 × 6
##    locus median   mean  lower  upper  rank
##    <chr>  <dbl>  <dbl>  <dbl>  <dbl> <int>
##  1 KL2   0.0894 0.0897 0.0711 0.110      1
##  2 KL102 0.0869 0.0872 0.0683 0.107      2
##  3 KL25  0.0855 0.0858 0.0678 0.106      3
##  4 KL15  0.0539 0.0543 0.0396 0.0708     4
##  5 KL62  0.0464 0.0468 0.0332 0.0624     5
##  6 KL30  0.0325 0.0328 0.0218 0.0459     6
##  7 KL10  0.0261 0.0265 0.0166 0.0388     7
##  8 KL17  0.0248 0.0252 0.0155 0.0370     8
##  9 KL23  0.0248 0.0252 0.0157 0.0369     9
## 10 KL51  0.0237 0.0240 0.0147 0.0355    10
## # ℹ 17 more rows
kbayes$global_est %>% filter(locus=="KL149")
## # A tibble: 1 × 6
##   locus median   mean  lower  upper  rank
##   <chr>  <dbl>  <dbl>  <dbl>  <dbl> <int>
## 1 KL149 0.0212 0.0215 0.0128 0.0326    13
kbayes$region_est %>% filter(locus=="KL149")
## # A tibble: 4 × 6
## # Groups:   locus [1]
##   locus subgroup         median    mean    lower  upper
##   <chr> <chr>             <dbl>   <dbl>    <dbl>  <dbl>
## 1 KL149 Eastern Africa  0.00433 0.00510 0.000878 0.0134
## 2 KL149 Southern Africa 0.0851  0.0867  0.0475   0.136 
## 3 KL149 Southern Asia   0.00609 0.00720 0.00112  0.0197
## 4 KL149 Western Africa  0.00573 0.00876 0.000475 0.0344
kbayes$global_est %>% filter(locus=="KL15")
## # A tibble: 1 × 6
##   locus median   mean  lower  upper  rank
##   <chr>  <dbl>  <dbl>  <dbl>  <dbl> <int>
## 1 KL15  0.0539 0.0543 0.0396 0.0708     4
kbayes$region_est %>% filter(locus=="KL15")
## # A tibble: 4 × 6
## # Groups:   locus [1]
##   locus subgroup         median   mean   lower  upper
##   <chr> <chr>             <dbl>  <dbl>   <dbl>  <dbl>
## 1 KL15  Eastern Africa  0.0495  0.0504 0.0303  0.0753
## 2 KL15  Southern Africa 0.00923 0.0109 0.00180 0.0294
## 3 KL15  Southern Asia   0.0916  0.0926 0.0593  0.132 
## 4 KL15  Western Africa  0.0321  0.0369 0.00713 0.0933
kbayes$global_est %>% filter(locus=="KL51")
## # A tibble: 1 × 6
##   locus median   mean  lower  upper  rank
##   <chr>  <dbl>  <dbl>  <dbl>  <dbl> <int>
## 1 KL51  0.0237 0.0240 0.0147 0.0355    10
kbayes$region_est %>% filter(locus=="KL51")
## # A tibble: 4 × 6
## # Groups:   locus [1]
##   locus subgroup         median    mean    lower  upper
##   <chr> <chr>             <dbl>   <dbl>    <dbl>  <dbl>
## 1 KL51  Eastern Africa  0.00380 0.00455 0.000650 0.0127
## 2 KL51  Southern Africa 0.00225 0.00348 0.000176 0.0134
## 3 KL51  Southern Asia   0.0697  0.0712  0.0425   0.108 
## 4 KL51  Western Africa  0.00431 0.00694 0.000316 0.0295
kbayes$global_est %>% filter(locus=="KL81")
## # A tibble: 1 × 6
##   locus median   mean   lower  upper  rank
##   <chr>  <dbl>  <dbl>   <dbl>  <dbl> <int>
## 1 KL81  0.0123 0.0126 0.00612 0.0214    25
kbayes$region_est %>% filter(locus=="KL81")
## # A tibble: 4 × 6
## # Groups:   locus [1]
##   locus subgroup         median    mean     lower   upper
##   <chr> <chr>             <dbl>   <dbl>     <dbl>   <dbl>
## 1 KL81  Eastern Africa  0.00162 0.00224 0.000153  0.00804
## 2 KL81  Southern Africa 0.00149 0.00251 0.0000947 0.0109 
## 3 KL81  Southern Asia   0.0357  0.0369  0.0169    0.0637 
## 4 KL81  Western Africa  0.00268 0.00494 0.000146  0.0231
kbayes$global_est %>% filter(locus=="KL28")
## # A tibble: 1 × 6
##   locus median   mean   lower  upper  rank
##   <chr>  <dbl>  <dbl>   <dbl>  <dbl> <int>
## 1 KL28  0.0136 0.0140 0.00706 0.0229    22
kbayes$region_est %>% filter(locus=="KL28")
## # A tibble: 4 × 6
## # Groups:   locus [1]
##   locus subgroup         median    mean    lower  upper
##   <chr> <chr>             <dbl>   <dbl>    <dbl>  <dbl>
## 1 KL28  Eastern Africa  0.0134  0.0142  0.00508  0.0279
## 2 KL28  Southern Africa 0.00607 0.00777 0.000918 0.0237
## 3 KL28  Southern Asia   0.00369 0.00475 0.000506 0.0149
## 4 KL28  Western Africa  0.0632  0.0679  0.0208   0.141
kbayes$global_est %>% filter(locus=="KL116")
## # A tibble: 1 × 6
##   locus  median    mean   lower  upper  rank
##   <chr>   <dbl>   <dbl>   <dbl>  <dbl> <int>
## 1 KL116 0.00599 0.00634 0.00218 0.0124    35
kbayes$region_est %>% filter(locus=="KL116")
## # A tibble: 4 × 6
## # Groups:   locus [1]
##   locus subgroup         median    mean    lower   upper
##   <chr> <chr>             <dbl>   <dbl>    <dbl>   <dbl>
## 1 KL116 Eastern Africa  0.00167 0.00230 0.000172 0.00808
## 2 KL116 Southern Africa 0.00159 0.00253 0.000112 0.0104 
## 3 KL116 Southern Asia   0.00218 0.00308 0.000202 0.0110 
## 4 KL116 Western Africa  0.0519  0.0565  0.0149   0.124
kbayes$global_est %>% filter(locus=="KL53")
## # A tibble: 1 × 6
##   locus  median    mean   lower  upper  rank
##   <chr>   <dbl>   <dbl>   <dbl>  <dbl> <int>
## 1 KL53  0.00466 0.00506 0.00140 0.0110    41
kbayes$region_est %>% filter(locus=="KL53")
## # A tibble: 4 × 6
## # Groups:   locus [1]
##   locus subgroup         median    mean     lower   upper
##   <chr> <chr>             <dbl>   <dbl>     <dbl>   <dbl>
## 1 KL53  Eastern Africa  0.00150 0.00209 0.000140  0.00737
## 2 KL53  Southern Africa 0.00138 0.00231 0.0000939 0.0102 
## 3 KL53  Southern Asia   0.00196 0.00285 0.000171  0.0104 
## 4 KL53  Western Africa  0.0361  0.0410  0.00794   0.101

numbers for text - O antigen

obayes$global_est
## # A tibble: 15 × 6
##    locus    median    mean     lower   upper  rank
##    <chr>     <dbl>   <dbl>     <dbl>   <dbl> <int>
##  1 O1.v1  0.263    0.264   0.234     0.294       1
##  2 O1.v2  0.219    0.219   0.191     0.249       2
##  3 O2afg  0.159    0.159   0.134     0.185       3
##  4 O4     0.0853   0.0856  0.0673    0.106       4
##  5 O2a    0.0641   0.0645  0.0489    0.0824      5
##  6 O3b    0.0576   0.0581  0.0429    0.0760      6
##  7 O5     0.0563   0.0568  0.0419    0.0737      7
##  8 OL13   0.0501   0.0506  0.0365    0.0669      8
##  9 O3/O3a 0.0299   0.0303  0.0197    0.0430      9
## 10 OL104  0.00340  0.00380 0.000794  0.00918    10
## 11 OL103  0.00212  0.00253 0.000327  0.00694    11
## 12 O12    0.00211  0.00250 0.000323  0.00690    12
## 13 OL102  0.000873 0.00127 0.0000343 0.00464    13
## 14 O2a.v3 0.000868 0.00127 0.0000351 0.00477    14
## 15 O1.v3  0.000873 0.00126 0.0000291 0.00468    15
obayes$global_est %>% filter(rank<=5) %>% pull(median) %>% sum()
## [1] 0.7905593
obayes$global_est %>% filter(rank<=5) %>% pull(lower) %>% sum()
## [1] 0.6757192
obayes$global_est %>% filter(rank<=5) %>% pull(upper) %>% sum()
## [1] 0.9163594
obayes$global_est
## # A tibble: 15 × 6
##    locus    median    mean     lower   upper  rank
##    <chr>     <dbl>   <dbl>     <dbl>   <dbl> <int>
##  1 O1.v1  0.263    0.264   0.234     0.294       1
##  2 O1.v2  0.219    0.219   0.191     0.249       2
##  3 O2afg  0.159    0.159   0.134     0.185       3
##  4 O4     0.0853   0.0856  0.0673    0.106       4
##  5 O2a    0.0641   0.0645  0.0489    0.0824      5
##  6 O3b    0.0576   0.0581  0.0429    0.0760      6
##  7 O5     0.0563   0.0568  0.0419    0.0737      7
##  8 OL13   0.0501   0.0506  0.0365    0.0669      8
##  9 O3/O3a 0.0299   0.0303  0.0197    0.0430      9
## 10 OL104  0.00340  0.00380 0.000794  0.00918    10
## 11 OL103  0.00212  0.00253 0.000327  0.00694    11
## 12 O12    0.00211  0.00250 0.000323  0.00690    12
## 13 OL102  0.000873 0.00127 0.0000343 0.00464    13
## 14 O2a.v3 0.000868 0.00127 0.0000351 0.00477    14
## 15 O1.v3  0.000873 0.00126 0.0000291 0.00468    15

numbers for text - O differences by region

# where is O4 found
data_NNS_sites10 %>% filter(O_genotype=="O4") %>% group_by(Region, Study, Site, ST, K_locus) %>% count() %>% arrange(-n)
## # A tibble: 50 × 6
## # Groups:   Region, Study, Site, ST, K_locus [50]
##    Region          Study         Site ST    K_locus     n
##    <chr>           <chr>        <dbl> <chr> <chr>   <int>
##  1 Southern Asia   CHRF            40 ST11  KL15       59
##  2 Eastern Africa  BARNARDS        20 ST37  KL15       28
##  3 Southern Africa BabyGERMS       30 ST152 KL149      16
##  4 Southern Asia   CHRF            40 ST502 KL15       15
##  5 Southern Asia   NeoOBS-India     5 ST11  KL15       14
##  6 Southern Africa BabyGERMS       31 ST405 KL151       9
##  7 Eastern Africa  MLW             36 ST45  KL15        8
##  8 Southern Asia   CHRF            41 ST11  KL15        7
##  9 Southern Asia   CHRF            40 ST437 KL36        6
## 10 Southern Asia   DH               2 ST11  KL15        6
## # ℹ 40 more rows
data_NNS_sites10 %>% filter(O_genotype=="O4" & Region=="Southern Asia") %>% group_by(ST, K_locus) %>% count() %>% arrange(-n)
## # A tibble: 16 × 3
## # Groups:   ST, K_locus [16]
##    ST        K_locus     n
##    <chr>     <chr>   <int>
##  1 ST11      KL15       87
##  2 ST502     KL15       15
##  3 ST437     KL36        9
##  4 ST340     KL15        5
##  5 ST152     KL149       3
##  6 ST273     KL15        3
##  7 ST2258    KL131       2
##  8 ST37      KL15        2
##  9 ST231     KL144       1
## 10 ST273     KL141       1
## 11 ST3623    KL15        1
## 12 ST392     KL27        1
## 13 ST429     KL27        1
## 14 ST70      KL15        1
## 15 ST726     KL15        1
## 16 ST883-1LV KL15        1
data_NNS_sites10 %>% filter(O_genotype=="O4" & Region=="Southern Asia") %>% group_by(ST, K_locus, Site) %>% count() %>% arrange(-n)
## # A tibble: 26 × 4
## # Groups:   ST, K_locus, Site [26]
##    ST    K_locus  Site     n
##    <chr> <chr>   <dbl> <int>
##  1 ST11  KL15       40    59
##  2 ST502 KL15       40    15
##  3 ST11  KL15        5    14
##  4 ST11  KL15       41     7
##  5 ST11  KL15        2     6
##  6 ST437 KL36       40     6
##  7 ST152 KL149      40     3
##  8 ST273 KL15       40     3
##  9 ST340 KL15       38     2
## 10 ST340 KL15       40     2
## # ℹ 16 more rows
data_NNS_sites10 %>% filter(O_genotype=="O2a.v1") %>% group_by(Region, Study, Site, ST, K_locus) %>% count() %>% arrange(-n)
## # A tibble: 0 × 6
## # Groups:   Region, Study, Site, ST, K_locus [0]
## # ℹ 6 variables: Region <chr>, Study <chr>, Site <dbl>, ST <chr>,
## #   K_locus <chr>, n <int>
data_NNS_sites10 %>% filter(O_genotype=="O2a.v1" & Region=="Southern Asia") %>% group_by(ST, K_locus) %>% count() %>% arrange(-n)
## # A tibble: 0 × 3
## # Groups:   ST, K_locus [0]
## # ℹ 3 variables: ST <chr>, K_locus <chr>, n <int>
data_NNS_sites10 %>% filter(O_genotype=="O2a.v1" & Region=="Southern Asia") %>% group_by(ST, K_locus, Site) %>% count() %>% arrange(-n)
## # A tibble: 0 × 4
## # Groups:   ST, K_locus, Site [0]
## # ℹ 4 variables: ST <chr>, K_locus <chr>, Site <dbl>, n <int>
data_NNS_sites10 %>% filter(O_genotype=="O5") %>% group_by(Region, Study, Site, ST, K_locus) %>% count() %>% arrange(-n)
## # A tibble: 21 × 6
## # Groups:   Region, Study, Site, ST, K_locus [21]
##    Region          Study      Site ST    K_locus     n
##    <chr>           <chr>     <dbl> <chr> <chr>   <int>
##  1 Eastern Africa  Kilifi       50 ST17  KL25       15
##  2 Southern Africa GBS          45 ST17  KL25        9
##  3 Southern Africa Botswana     59 ST17  KL25        8
##  4 Southern Africa BabyGERMS    33 ST17  KL25        7
##  5 Southern Africa BabyGERMS    32 ST17  KL25        4
##  6 Eastern Africa  Kilifi       50 ST336 KL25        3
##  7 Eastern Africa  MLW          36 ST17  KL25        3
##  8 Southern Africa BabyGERMS    31 ST17  KL25        3
##  9 Southern Asia   AKU          11 ST17  KL25        3
## 10 Southern Africa BabyGERMS    30 ST17  KL25        2
## # ℹ 11 more rows
data_NNS_sites10 %>% filter(O_genotype=="O5" & Region=="Southern Africa") %>% group_by(ST, K_locus) %>% count() %>% arrange(-n)
## # A tibble: 3 × 3
## # Groups:   ST, K_locus [3]
##   ST     K_locus     n
##   <chr>  <chr>   <int>
## 1 ST17   KL25       37
## 2 ST2621 KL25        1
## 3 ST3184 KL25        1
data_NNS_sites10 %>% filter(O_genotype=="O5" & Region=="Southern Africa") %>% group_by(ST, K_locus, Site) %>% count() %>% arrange(-n)
## # A tibble: 10 × 4
## # Groups:   ST, K_locus, Site [10]
##    ST     K_locus  Site     n
##    <chr>  <chr>   <dbl> <int>
##  1 ST17   KL25       45     9
##  2 ST17   KL25       59     8
##  3 ST17   KL25       33     7
##  4 ST17   KL25       32     4
##  5 ST17   KL25       31     3
##  6 ST17   KL25       30     2
##  7 ST17   KL25       34     2
##  8 ST17   KL25       35     2
##  9 ST2621 KL25       33     1
## 10 ST3184 KL25       35     1

Global and regional coverage - top20 global - Fig3bi

kbayes_raw_coverage <- getCumulativeCoverageGlobalRegional(kbayes_raw, kbayes$locus_rank) %>%
  mutate(subgroup=fct_relevel(subgroup,c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa")))

coverage_globalTop20_globalRegional <- plotCoverage(kbayes_raw_coverage, kbayes$locus_rank, maxRank=20, cols=region_cols)

coverage_globalTop20_globalRegional

fatal - read posterior estimates for adjusted counts (main) and raw counts (for comparison)

# read raw Bayesian estimates for K - using fatal samples from sites with at least 10 fatal
kbayes_fatal_raw_min10perSite <- parseModelledEstimates(global_post="outputs_core/K_Fatal_min10_raw_28_posterior_global.csv.gz", region_post="outputs_core/K_Fatal_min10_raw_28_posterior_subgroup.csv.gz")
## Rows: 576000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 2304000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
kbayes_fatal_raw_min10perSite_coverage <- getCumulativeCoverage(kbayes$locus_rank, kbayes_fatal_raw_min10perSite$global_post) %>% mutate(type="min10")

ESBL - read posterior estimates for adjusted counts (main) and raw counts (for comparison)

kbayes_esbl_raw_min10perSite <- parseModelledEstimates(global_post="outputs_core/K_ESBL_min10_raw_28_posterior_global.csv.gz", region_post="outputs_core/K_ESBL_min10_raw_28_posterior_subgroup.csv.gz")
## Rows: 960000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3840000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
kbayes_esbl_raw_min10perSite_coverage <- getCumulativeCoverage(kbayes$locus_rank, kbayes_esbl_raw_min10perSite$global_post) %>% mutate(type="min10")

CP - read posterior estimates for adjusted counts (main) and raw counts (for comparison)

kbayes_cp_raw_min10perSite <- parseModelledEstimates(global_post="outputs_core/K_Carba_min10_raw_28_posterior_global.csv.gz", region_post="outputs_core/K_Carba_min10_raw_28_posterior_subgroup.csv.gz")
## Rows: 588000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1764000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
kbayes_cp_raw_min10perSite_coverage <- getCumulativeCoverage(kbayes$locus_rank, kbayes_cp_raw_min10perSite$global_post) %>% mutate(type="min10")

combine coverage estimates for subgroups (fatal, ESBL, CP) - Fig3ai

kbayes_raw_coverage_subgroups <- kbayes_esbl_raw_min10perSite_coverage %>% mutate(subgroup="ESBL") %>%
  bind_rows(kbayes_cp_raw_min10perSite_coverage %>% mutate(subgroup="CP")) %>%
  bind_rows(kbayes_fatal_raw_min10perSite_coverage %>% mutate(subgroup="Fatal")) %>%
  bind_rows(kbayes_raw_coverage %>% filter(subgroup=="Global") %>% mutate(subgroup="All") ) %>%
  mutate(subgroup=fct_relevel(subgroup,c("All", "Fatal", "ESBL", "CP")))

coverage_globalTop20_subgroups <- plotCoverage(kbayes_raw_coverage_subgroups, kbayes$locus_rank, maxRank=20, cols=group_cols)

coverage_globalTop20_subgroups

take top 8 per region - Fig3aii and Fig3bii

## extract cluster-adjusted coverage per region
K_rank_SA <- kbayes$region_est %>% ungroup %>% filter(subgroup=="Southern Africa") %>% arrange(-mean) %>% mutate(rank=row_number())
K_rank_WA <- kbayes$region_est %>% ungroup %>% filter(subgroup=="Western Africa") %>% arrange(-mean) %>% mutate(rank=row_number())
K_rank_EA <- kbayes$region_est %>% ungroup %>% filter(subgroup=="Eastern Africa") %>% arrange(-mean) %>% mutate(rank=row_number())
K_rank_SAs <- kbayes$region_est %>% ungroup %>% filter(subgroup=="Southern Asia") %>% arrange(-mean) %>% mutate(rank=row_number())


# get top-8 within each region - gives 21 loci
n<-8

topN <- bind_rows(K_rank_SA, K_rank_EA) %>% bind_rows(K_rank_WA) %>% bind_rows(K_rank_SAs) %>% filter(rank<=n) %>% pull(locus) %>% unique()

length(unique(topN))
## [1] 21
kbayes$locus_rank[kbayes$locus_rank$locus %in% topN,]
## # A tibble: 21 × 2
##    locus  rank
##    <chr> <int>
##  1 KL2       1
##  2 KL102     2
##  3 KL25      3
##  4 KL15      4
##  5 KL62      5
##  6 KL30      6
##  7 KL10      7
##  8 KL17      8
##  9 KL23      9
## 10 KL51     10
## # ℹ 11 more rows
# remove KL8, as this only benefits Southern Africa which already exceeds 80% without KL9, and brings us back to 20 loci
k_rank_region8 <- kbayes$locus_rank[kbayes$locus_rank$locus %in% topN,] %>% 
  filter(locus!="KL9") %>%
  mutate(rank=1:(length(unique(topN))-1))

# calculate global & regional coverage for these K - Fig 3bii
kbayes_raw_coverage_top8perRegion <- getCumulativeCoverageGlobalRegional(kbayes_raw, k_rank_region8) %>%
  mutate(subgroup=fct_relevel(subgroup,c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa")))

coverage_global_top8perRegion <- plotCoverage(kbayes_raw_coverage_top8perRegion, k_rank_region8, maxRank=nrow(k_rank_region8), cols=region_cols)

coverage_global_top8perRegion

# calculate subgroup coverage for these K - Fig 3aii
kbayes_fatal_raw_min10perSite_coverage_top8perRegion <- getCumulativeCoverage(k_rank_region8, kbayes_fatal_raw_min10perSite$global_post)
kbayes_esbl_raw_min10perSite_coverage_top8perRegion <- getCumulativeCoverage(k_rank_region8, kbayes_esbl_raw_min10perSite$global_post)
kbayes_cp_raw_min10perSite_coverage_top8perRegion <- getCumulativeCoverage(k_rank_region8, kbayes_cp_raw_min10perSite$global_post)

kbayes_raw_coverage_subgroups_top8perRegion <- kbayes_esbl_raw_min10perSite_coverage_top8perRegion %>% mutate(subgroup="ESBL") %>%
  bind_rows(kbayes_cp_raw_min10perSite_coverage_top8perRegion %>% mutate(subgroup="CP")) %>%
  bind_rows(kbayes_fatal_raw_min10perSite_coverage_top8perRegion %>% mutate(subgroup="Fatal")) %>%
  bind_rows(kbayes_raw_coverage_top8perRegion %>% filter(subgroup=="Global") %>% mutate(subgroup="All") ) %>%
  mutate(subgroup=fct_relevel(subgroup,c("All", "Fatal", "ESBL", "CP")))

coverage_subgroups_top8perRegion <- plotCoverage(kbayes_raw_coverage_subgroups_top8perRegion, k_rank_region8, maxRank=nrow(k_rank_region8), cols=group_cols)

coverage_subgroups_top8perRegion

(coverage_globalTop20_subgroups + ggtitle("a) Global coverage, by case type", subtitle="KL set 1: global top-20") + coverage_subgroups_top8perRegion + ggtitle("", subtitle="KL set 2: top-8 per region") + patchwork::plot_layout(guides="collect", axes="collect")) / (coverage_globalTop20_globalRegional + ggtitle("b) Per-region coverage", subtitle="KL set 1: global top-20") + coverage_global_top8perRegion + ggtitle("", subtitle="KL set 2: top-8 per region") + patchwork::plot_layout(guides="collect", axes="collect"))

ggsave("figures/Fig3_Kcoverage_raw.pdf", width=8, height=8)
ggsave("figures/Fig3_Kcoverage_raw.png", width=8, height=8)

Table 2 - coverage stats for all/ESBL/CP/fatal, for global and regional

kbayes_fatal_raw_min10perSite_coverageRegional <- getCumulativeCoverageGlobalRegional(ranks=kbayes$locus_rank, bayes=kbayes_fatal_raw_min10perSite, maxRank=20)

kbayes_esbl_raw_min10perSite_coverageRegional <- getCumulativeCoverageGlobalRegional(ranks=kbayes$locus_rank, bayes=kbayes_esbl_raw_min10perSite, maxRank=20)

kbayes_cp_raw_min10perSite_coverageRegional <- getCumulativeCoverageGlobalRegional(ranks=kbayes$locus_rank, bayes=kbayes_cp_raw_min10perSite, maxRank=20)

table2 <- kbayes_raw_coverage %>% mutate(cases="All") %>%
  bind_rows(kbayes_fatal_raw_min10perSite_coverageRegional %>% mutate(cases="  Fatal infections")) %>%
  bind_rows(kbayes_esbl_raw_min10perSite_coverageRegional %>% mutate(cases="  ESBL infections")) %>%
  bind_rows(kbayes_cp_raw_min10perSite_coverageRegional %>% mutate(cases="  CP infections")) %>%
  filter(rank %in% c(5,10,15,20)) %>% 
  mutate(upper=if_else(upper>1, 1, upper)) %>%
  mutate(summary=paste0(round(mean,3)*100," (",round(lower,3)*100,"-",round(upper,3)*100,")")) %>% 
  select(rank, subgroup, cases, summary) %>% 
  pivot_wider(names_from=rank, values_from=summary, id_cols=c(subgroup, cases)) %>%
  rename(Region=subgroup)

add totals per subgroup

fatal_counts <- data_NNS_sites10 %>% filter(Mortality==1) %>% group_by(Region) %>% count()
fatal_counts <- fatal_counts %>% bind_rows(list(Region="Global", n=sum(fatal_counts$n)))

esbl_counts <- data_NNS_sites10 %>% filter(resistance_score>=1) %>% group_by(Region) %>% count()
esbl_counts <- esbl_counts %>% bind_rows(list(Region="Global", n=sum(esbl_counts$n)))

cp_counts <- data_NNS_sites10 %>% filter(resistance_score>=2) %>% group_by(Region) %>% count()
cp_counts <- cp_counts %>% bind_rows(list(Region="Global", n=sum(cp_counts$n)))

total_counts <- data_NNS_sites10 %>% group_by(Region) %>% count()
total_counts <- total_counts %>% bind_rows(list(Region="Global", n=sum(cp_counts$n)))
table2 <- esbl_counts %>% mutate(cases="  ESBL infections") %>%
  bind_rows(cp_counts%>% mutate(cases="  CP infections")) %>%
  bind_rows(fatal_counts %>% mutate(cases="  Fatal infections")) %>%
  bind_rows(total_counts %>% mutate(cases="All")) %>%
  full_join(table2, by=c("Region", "cases")) %>%
  mutate(cases=fct_relevel(cases,c("All", "  Fatal infections", "  ESBL infections", "  CP infections")))

# sort function isn't working when ordering region factor using fct_relevel
table2$Region <- factor(table2$Region, levels = c("Global", "Eastern Africa", "Southern Africa", "Western Africa", "Southern Asia"))

table2 <- table2 %>%
  arrange(Region, cases) %>%
  relocate(n, .after=cases) %>% 
  mutate(cases=if_else(cases=="All", Region, cases)) %>%
  rename_with(~ paste0(.x, " KL"), c("5", "10", "15", "20")) %>%
  rename(N=n) %>%
  rename(Data=cases) %>%
  ungroup %>% select(-Region)
  
write_tsv(table2, file="tables/Table2_coverage.tsv")

Table S4 - K set2 coverage stats for all/ESBL/CP/fatal, for global and regional

kbayes_fatal_raw_min10perSite_coverageRegional_top8perRegion <- getCumulativeCoverageGlobalRegional(ranks=k_rank_region8, bayes=kbayes_fatal_raw_min10perSite, maxRank=20)

kbayes_esbl_raw_min10perSite_coverageRegional_top8perRegion <- getCumulativeCoverageGlobalRegional(ranks=k_rank_region8, bayes=kbayes_esbl_raw_min10perSite, maxRank=20)

kbayes_cp_raw_min10perSite_coverageRegional_top8perRegion <- getCumulativeCoverageGlobalRegional(ranks=k_rank_region8, bayes=kbayes_cp_raw_min10perSite, maxRank=20)

tableS4 <- kbayes_raw_coverage_top8perRegion %>% mutate(cases="All") %>%
  bind_rows(kbayes_fatal_raw_min10perSite_coverageRegional_top8perRegion %>% mutate(cases="  Fatal infections")) %>%
  bind_rows(kbayes_esbl_raw_min10perSite_coverageRegional_top8perRegion %>% mutate(cases="  ESBL infections")) %>%
  bind_rows(kbayes_cp_raw_min10perSite_coverageRegional_top8perRegion %>% mutate(cases="  CP infections")) %>%
  mutate(upper=if_else(upper>1, 1, upper)) %>%
  filter(rank %in% c(5,10,15,20)) %>% 
  mutate(summary=paste0(round(mean,3)*100," (",round(lower,3)*100,"-",round(upper,3)*100,")")) %>% 
  select(rank, subgroup, cases, summary) %>% 
  pivot_wider(names_from=rank, values_from=summary, id_cols=c(subgroup, cases)) %>%
  rename(Region=subgroup)
tableS4 <- esbl_counts %>% mutate(cases="  ESBL infections") %>%
  bind_rows(cp_counts%>% mutate(cases="  CP infections")) %>%
  bind_rows(fatal_counts %>% mutate(cases="  Fatal infections")) %>%
  bind_rows(total_counts %>% mutate(cases="All")) %>%
  full_join(tableS4, by=c("Region", "cases")) %>%
  mutate(cases=fct_relevel(cases,c("All", "  Fatal infections", "  ESBL infections", "  CP infections")))

# sort function isn't working when ordering region factor using fct_relevel
tableS4$Region <- factor(tableS4$Region, levels = c("Global", "Eastern Africa", "Southern Africa", "Western Africa", "Southern Asia"))

tableS4 <- tableS4 %>%
  arrange(Region, cases) %>%
  relocate(n, .after=cases) %>% 
  mutate(cases=if_else(cases=="All", Region, cases)) %>%
  rename_with(~ paste0(.x, " KL"), c("5", "10", "15", "20")) %>%
  rename(N=n) %>%
  rename(Data=cases) %>%
  ungroup %>% select(-Region)
  
write_tsv(tableS4, file="tables/TableS4_coverageSet2.tsv")

Fig S4 - coverage for region-focused K sets

# ranked by Southern Africa
kbayes_raw_coverage_top20SA <- getCumulativeCoverageGlobalRegional(kbayes_raw, K_rank_SA %>% select(locus, rank), maxRank=20) %>%
  mutate(subgroup=fct_relevel(subgroup,c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa")))

kbayes_raw_coverage_globalRegional_top20SA <- plotCoverage(kbayes_raw_coverage_top20SA, K_rank_SA %>% select(locus, rank), maxRank=20, cols=region_cols)

# ranked by Eastern Africa
kbayes_raw_coverage_top20EA <- getCumulativeCoverageGlobalRegional(kbayes_raw, K_rank_EA %>% select(locus, rank), maxRank=20) %>%
  mutate(subgroup=fct_relevel(subgroup,c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa")))

kbayes_raw_coverage_globalRegional_top20EA <- plotCoverage(kbayes_raw_coverage_top20EA, K_rank_EA %>% select(locus, rank), maxRank=20, cols=region_cols)

# ranked by Western Africa
kbayes_raw_coverage_top20WA <- getCumulativeCoverageGlobalRegional(kbayes_raw, K_rank_WA %>% select(locus, rank), maxRank=20) %>%
  mutate(subgroup=fct_relevel(subgroup,c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa")))

kbayes_raw_coverage_globalRegional_top20WA <- plotCoverage(kbayes_raw_coverage_top20WA, K_rank_WA %>% select(locus, rank), maxRank=20, cols=region_cols)

# ranked by Southern Asia
kbayes_raw_coverage_top20SAs <- getCumulativeCoverageGlobalRegional(kbayes_raw, K_rank_SAs %>% select(locus, rank), maxRank=20) %>%
  mutate(subgroup=fct_relevel(subgroup,c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa")))

kbayes_raw_coverage_globalRegional_top20SAs <- plotCoverage(kbayes_raw_coverage_top20SAs, K_rank_SAs %>% select(locus, rank), maxRank=20, cols=region_cols)

(kbayes_raw_coverage_globalRegional_top20SA + ggtitle("Top-20 K loci in S. Africa") + kbayes_raw_coverage_globalRegional_top20EA + ggtitle("Top-20 K loci in E. Africa")) / (kbayes_raw_coverage_globalRegional_top20WA + ggtitle("Top-20 K loci in W. Africa") + kbayes_raw_coverage_globalRegional_top20SAs + ggtitle("Top-20 K loci in S. Asia")) + patchwork::plot_layout(guides="collect")

ggsave("figures/FigS4_KcoverageRaw_top20PerRegion.pdf", width=8, height=8)
ggsave("figures/FigS4_KcoverageRaw_top20PerRegion.png", width=8, height=8)

numbers for text - K coverage

# coverage of top-20
kbayes_raw_coverage %>% filter(rank %in% c(5,10,15,20) & subgroup=="Global") %>% select(mean, lower, upper)
## # A tibble: 4 × 3
##    mean lower upper
##   <dbl> <dbl> <dbl>
## 1 0.488 0.460 0.518
## 2 0.588 0.557 0.621
## 3 0.671 0.637 0.706
## 4 0.736 0.700 0.773
kbayes_raw_coverage %>% filter(rank==10 & subgroup=="Global") %>% select(mean, lower, upper) - (kbayes_raw_coverage %>% filter(rank==5 & subgroup=="Global") %>% select(mean, lower, upper))
##         mean      lower    upper
## 1 0.09971156 0.09681222 0.102871
kbayes_raw_coverage %>% filter(rank==15 & subgroup=="Global") %>% select(mean, lower, upper) - (kbayes_raw_coverage %>% filter(rank==10 & subgroup=="Global") %>% select(mean, lower, upper))
##        mean     lower      upper
## 1 0.0831687 0.0806286 0.08563087
kbayes_raw_coverage %>% filter(rank==20 & subgroup=="Global") %>% select(mean, lower, upper) - (kbayes_raw_coverage %>% filter(rank==15 & subgroup=="Global") %>% select(mean, lower, upper))
##        mean      lower      upper
## 1 0.0645007 0.06241249 0.06650469
# available data on fatal cases
data_NNS_sites10 %>% filter(!is.na(Mortality)) %>% nrow()
## [1] 1053
data_NNS_sites10 %>% filter(Mortality==1) %>% nrow()
## [1] 390
# coverage of region-focused subsets
kbayes_raw_coverage_globalRegional_top20SA$data %>% filter(rank==20)
## # A tibble: 5 × 7
##    rank  mean median lower upper locus subgroup       
##   <int> <dbl>  <dbl> <dbl> <dbl> <chr> <fct>          
## 1    20 0.706  0.706 0.671 0.741 KL19  Global         
## 2    20 0.727  0.727 0.680 0.773 KL19  Eastern Africa 
## 3    20 0.591  0.591 0.529 0.657 KL19  Southern Asia  
## 4    20 0.909  0.908 0.798 1     KL19  Southern Africa
## 5    20 0.500  0.497 0.362 0.656 KL19  Western Africa
kbayes_raw_coverage_globalRegional_top20EA$data %>% filter(rank==20)
## # A tibble: 5 × 7
##    rank  mean median lower upper locus subgroup       
##   <int> <dbl>  <dbl> <dbl> <dbl> <chr> <fct>          
## 1    20 0.716  0.715 0.680 0.751 KL38  Global         
## 2    20 0.808  0.807 0.758 0.858 KL38  Eastern Africa 
## 3    20 0.577  0.577 0.516 0.642 KL38  Southern Asia  
## 4    20 0.614  0.613 0.525 0.708 KL38  Southern Africa
## 5    20 0.603  0.600 0.452 0.774 KL38  Western Africa
kbayes_raw_coverage_globalRegional_top20WA$data %>% filter(rank==20)
## # A tibble: 5 × 7
##    rank  mean median lower upper locus subgroup       
##   <int> <dbl>  <dbl> <dbl> <dbl> <chr> <fct>          
## 1    20 0.699  0.699 0.664 0.735 KL30  Global         
## 2    20 0.788  0.788 0.740 0.838 KL30  Eastern Africa 
## 3    20 0.527  0.526 0.468 0.588 KL30  Southern Asia  
## 4    20 0.612  0.610 0.523 0.706 KL30  Southern Africa
## 5    20 0.799  0.796 0.621 0.996 KL30  Western Africa
kbayes_raw_coverage_globalRegional_top20SAs$data %>% filter(rank==20)
## # A tibble: 5 × 7
##    rank  mean median lower upper locus subgroup       
##   <int> <dbl>  <dbl> <dbl> <dbl> <chr> <fct>          
## 1    20 0.705  0.705 0.669 0.741 KL122 Global         
## 2    20 0.696  0.696 0.650 0.742 KL122 Eastern Africa 
## 3    20 0.802  0.800 0.728 0.878 KL122 Southern Asia  
## 4    20 0.624  0.623 0.534 0.721 KL122 Southern Africa
## 5    20 0.474  0.471 0.340 0.627 KL122 Western Africa
# coverage of KL set 2
kbayes_raw_coverage_top8perRegion%>% filter(rank==20)
## # A tibble: 5 × 7
##    rank  mean median lower upper locus subgroup       
##   <int> <dbl>  <dbl> <dbl> <dbl> <chr> <fct>          
## 1    20 0.722  0.722 0.686 0.759 KL53  Global         
## 2    20 0.704  0.703 0.658 0.750 KL53  Eastern Africa 
## 3    20 0.724  0.723 0.654 0.795 KL53  Southern Asia  
## 4    20 0.810  0.809 0.706 0.919 KL53  Southern Africa
## 5    20 0.698  0.695 0.536 0.882 KL53  Western Africa
kbayes_raw_coverage_subgroups_top8perRegion %>% filter(rank==20)
## # A tibble: 4 × 7
##    rank  mean median lower upper locus subgroup
##   <int> <dbl>  <dbl> <dbl> <dbl> <chr> <fct>   
## 1    20 0.739  0.739 0.702 0.776 KL53  ESBL    
## 2    20 0.809  0.808 0.736 0.884 KL53  CP      
## 3    20 0.690  0.689 0.614 0.770 KL53  Fatal   
## 4    20 0.722  0.722 0.686 0.759 KL53  All
# what is extra in set2 that is not in top-20?
k_rank_region8 %>% left_join(kbayes$locus, by="locus") %>% filter(rank.y>20)
## # A tibble: 5 × 3
##   locus rank.x rank.y
##   <chr>  <int>  <int>
## 1 KL28      16     22
## 2 KL81      17     25
## 3 KL8       18     27
## 4 KL116     19     35
## 5 KL53      20     41

Global and regional O coverage - top10 global - Fig4d

obayes_raw_coverage <- getCumulativeCoverageGlobalRegional(obayes_raw, obayes$locus_rank) %>%
  mutate(subgroup=fct_relevel(subgroup,c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa")))

ocoverage_globalTop10_globalRegional <- plotCoverage(obayes_raw_coverage, obayes$locus_rank, maxRank=10, xintercept=c(5,10), cols=region_cols, col_title = "d) Region")

ocoverage_globalTop10_globalRegional

O fatal - read posterior estimates for adjusted counts (main) and raw counts (for comparison)

obayes_fatal_raw_min10perSite <- parseModelledEstimates(global_post="outputs_core/O_Fatal_min10_raw_28_posterior_global.csv.gz", region_post="outputs_core/O_Fatal_min10_raw_28_posterior_subgroup.csv.gz", fixOnames = T)
## Rows: 132000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 528000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
obayes_fatal_raw_min10perSite_coverage <- getCumulativeCoverage(obayes$locus_rank, obayes_fatal_raw_min10perSite$global_post) %>% mutate(type="min10")

ESBL - read posterior estimates for adjusted counts (main) and raw counts (for comparison)

obayes_esbl_raw_min10perSite <- parseModelledEstimates(global_post="outputs_core/O_ESBL_min10_raw_28_posterior_global.csv.gz", region_post="outputs_core/O_ESBL_min10_raw_28_posterior_subgroup.csv.gz", fixOnames = T)
## Rows: 156000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 624000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
obayes_esbl_raw_min10perSite_coverage <- getCumulativeCoverage(obayes$locus_rank, obayes_esbl_raw_min10perSite$global_post) %>% mutate(type="min10")

CP - read posterior estimates for adjusted counts (main) and raw counts (for comparison)

obayes_cp_raw_min10perSite <- parseModelledEstimates(global_post="outputs_core/O_Carba_min10_raw_28_posterior_global.csv.gz", region_post="outputs_core/O_Carba_min10_raw_28_posterior_subgroup.csv.gz", fixOnames = T)
## Rows: 120000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 360000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
obayes_cp_raw_min10perSite_coverage <- getCumulativeCoverage(obayes$locus_rank, obayes_cp_raw_min10perSite$global_post) %>% mutate(type="min10")

combine coverage estimates for subgroups (fatal, ESBL, CP) - Fig4c

obayes_raw_coverage_subgroups <- obayes_esbl_raw_min10perSite_coverage %>% mutate(subgroup="ESBL") %>%
  bind_rows(obayes_cp_raw_min10perSite_coverage %>% mutate(subgroup="CP")) %>%
  bind_rows(obayes_fatal_raw_min10perSite_coverage %>% mutate(subgroup="Fatal")) %>%
  bind_rows(obayes_raw_coverage %>% filter(subgroup=="Global") %>% mutate(subgroup="All") ) %>%
  mutate(subgroup=fct_relevel(subgroup,c("All", "Fatal", "ESBL", "CP")))

o_coverage_globalTop10_subgroups <- plotCoverage(obayes_raw_coverage_subgroups, obayes$locus_rank, maxRank=10, xintercept=c(5,10), cols=group_cols, col_title = "c) Subgroup")

o_coverage_globalTop10_subgroups

(o_prev_global_dist + o_prev_region_heatmap) / (o_coverage_globalTop10_subgroups + ggtitle("c) Global coverage") + ocoverage_globalTop10_globalRegional + ggtitle("d) Regional coverage") + patchwork::plot_layout(guides="collect"))
## Picking joint bandwidth of 0.0913
## Warning: Removed 118 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

ggsave("figures/Fig4_O_prevAdj_coverageRaw.pdf", width=8, height=8)
## Picking joint bandwidth of 0.0913
## Warning: Removed 118 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
ggsave("figures/Fig4_O_prevAdj_coverageRaw.png", width=8, height=8)
## Picking joint bandwidth of 0.0913
## Warning: Removed 118 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

numbers for text - O prevalence and coverage

# O prevalence
obayes$global_est
## # A tibble: 15 × 6
##    locus    median    mean     lower   upper  rank
##    <chr>     <dbl>   <dbl>     <dbl>   <dbl> <int>
##  1 O1.v1  0.263    0.264   0.234     0.294       1
##  2 O1.v2  0.219    0.219   0.191     0.249       2
##  3 O2afg  0.159    0.159   0.134     0.185       3
##  4 O4     0.0853   0.0856  0.0673    0.106       4
##  5 O2a    0.0641   0.0645  0.0489    0.0824      5
##  6 O3b    0.0576   0.0581  0.0429    0.0760      6
##  7 O5     0.0563   0.0568  0.0419    0.0737      7
##  8 OL13   0.0501   0.0506  0.0365    0.0669      8
##  9 O3/O3a 0.0299   0.0303  0.0197    0.0430      9
## 10 OL104  0.00340  0.00380 0.000794  0.00918    10
## 11 OL103  0.00212  0.00253 0.000327  0.00694    11
## 12 O12    0.00211  0.00250 0.000323  0.00690    12
## 13 OL102  0.000873 0.00127 0.0000343 0.00464    13
## 14 O2a.v3 0.000868 0.00127 0.0000351 0.00477    14
## 15 O1.v3  0.000873 0.00126 0.0000291 0.00468    15
obayes$region_est
## # A tibble: 60 × 6
## # Groups:   locus [15]
##    locus subgroup          median    mean     lower   upper
##    <chr> <chr>              <dbl>   <dbl>     <dbl>   <dbl>
##  1 O1.v1 Eastern Africa  0.284    0.285   0.240     0.332  
##  2 O1.v1 Southern Africa 0.261    0.263   0.199     0.332  
##  3 O1.v1 Southern Asia   0.224    0.225   0.175     0.278  
##  4 O1.v1 Western Africa  0.294    0.297   0.196     0.415  
##  5 O1.v2 Eastern Africa  0.228    0.228   0.187     0.272  
##  6 O1.v2 Southern Africa 0.221    0.222   0.163     0.288  
##  7 O1.v2 Southern Asia   0.193    0.195   0.148     0.248  
##  8 O1.v2 Western Africa  0.257    0.260   0.162     0.371  
##  9 O1.v3 Eastern Africa  0.000836 0.00131 0.0000268 0.00531
## 10 O1.v3 Southern Africa 0.000567 0.00103 0.0000166 0.00480
## # ℹ 50 more rows
# coverage
obayes_raw_coverage
## # A tibble: 150 × 7
##     rank  mean median lower upper locus  subgroup
##    <int> <dbl>  <dbl> <dbl> <dbl> <chr>  <fct>   
##  1     1 0.253  0.252 0.233 0.273 O1.v1  Global  
##  2     2 0.433  0.433 0.407 0.460 O1.v2  Global  
##  3     3 0.682  0.682 0.650 0.715 O2afg  Global  
##  4     4 0.804  0.804 0.769 0.840 O4     Global  
##  5     5 0.855  0.855 0.819 0.893 O2a    Global  
##  6     6 0.883  0.883 0.845 0.920 O3b    Global  
##  7     7 0.921  0.921 0.883 0.959 O5     Global  
##  8     8 0.969  0.969 0.930 1.01  OL13   Global  
##  9     9 0.990  0.990 0.951 1.03  O3/O3a Global  
## 10    10 0.992  0.992 0.952 1.03  OL104  Global  
## # ℹ 140 more rows
# STs with O4 or O2a in Southern Asia
data_NNS_sites10 %>% filter(Region=="Southern Asia" & O_genotype=="O4") %>% group_by(ST, K_locus)%>% count()
## # A tibble: 16 × 3
## # Groups:   ST, K_locus [16]
##    ST        K_locus     n
##    <chr>     <chr>   <int>
##  1 ST11      KL15       87
##  2 ST152     KL149       3
##  3 ST2258    KL131       2
##  4 ST231     KL144       1
##  5 ST273     KL141       1
##  6 ST273     KL15        3
##  7 ST340     KL15        5
##  8 ST3623    KL15        1
##  9 ST37      KL15        2
## 10 ST392     KL27        1
## 11 ST429     KL27        1
## 12 ST437     KL36        9
## 13 ST502     KL15       15
## 14 ST70      KL15        1
## 15 ST726     KL15        1
## 16 ST883-1LV KL15        1
data_NNS_sites10 %>% filter(Region=="Southern Asia" & O_genotype=="O4") %>% group_by(ST, K_locus, Site)%>% count()
## # A tibble: 26 × 4
## # Groups:   ST, K_locus, Site [26]
##    ST     K_locus  Site     n
##    <chr>  <chr>   <dbl> <int>
##  1 ST11   KL15        2     6
##  2 ST11   KL15        5    14
##  3 ST11   KL15       38     1
##  4 ST11   KL15       40    59
##  5 ST11   KL15       41     7
##  6 ST152  KL149      40     3
##  7 ST2258 KL131      38     1
##  8 ST2258 KL131      41     1
##  9 ST231  KL144      11     1
## 10 ST273  KL141      40     1
## # ℹ 16 more rows
data_NNS_sites10 %>% filter(Region=="Southern Asia" & O_genotype=="O2a") %>% group_by(ST, K_locus)%>% count()
## # A tibble: 9 × 3
## # Groups:   ST, K_locus [9]
##   ST        K_locus     n
##   <chr>     <chr>   <int>
## 1 ST11      KL24        7
## 2 ST1306    KL146       1
## 3 ST147     KL48        1
## 4 ST147     KL64        6
## 5 ST16      KL48        6
## 6 ST268-1LV KL20        1
## 7 ST2805    KL110       2
## 8 ST441-1LV KL62        1
## 9 ST45      KL24        6
data_NNS_sites10 %>% filter(Region=="Southern Asia" & O_genotype=="O2a") %>% group_by(ST, K_locus, Site)%>% count()
## # A tibble: 17 × 4
## # Groups:   ST, K_locus, Site [17]
##    ST        K_locus  Site     n
##    <chr>     <chr>   <dbl> <int>
##  1 ST11      KL24        6     4
##  2 ST11      KL24       26     1
##  3 ST11      KL24       38     2
##  4 ST1306    KL146      15     1
##  5 ST147     KL48       40     1
##  6 ST147     KL64       10     2
##  7 ST147     KL64       11     1
##  8 ST147     KL64       15     1
##  9 ST147     KL64       40     1
## 10 ST147     KL64       41     1
## 11 ST16      KL48        5     2
## 12 ST16      KL48       40     4
## 13 ST268-1LV KL20        6     1
## 14 ST2805    KL110      38     2
## 15 ST441-1LV KL62       41     1
## 16 ST45      KL24        2     5
## 17 ST45      KL24       15     1
data_NNS_sites10 %>% filter(Region=="Southern Africa" & O_genotype=="O5") %>% group_by(ST, K_locus)%>% count()
## # A tibble: 3 × 3
## # Groups:   ST, K_locus [3]
##   ST     K_locus     n
##   <chr>  <chr>   <int>
## 1 ST17   KL25       37
## 2 ST2621 KL25        1
## 3 ST3184 KL25        1
data_NNS_sites10 %>% filter(Region=="Southern Africa" & O_genotype=="O5") %>% group_by(ST, K_locus, Site, Country)%>% count()
## # A tibble: 10 × 5
## # Groups:   ST, K_locus, Site, Country [10]
##    ST     K_locus  Site Country          n
##    <chr>  <chr>   <dbl> <chr>        <int>
##  1 ST17   KL25       30 South Africa     2
##  2 ST17   KL25       31 South Africa     3
##  3 ST17   KL25       32 South Africa     4
##  4 ST17   KL25       33 South Africa     7
##  5 ST17   KL25       34 South Africa     2
##  6 ST17   KL25       35 South Africa     2
##  7 ST17   KL25       45 South Africa     9
##  8 ST17   KL25       59 Botswana         8
##  9 ST2621 KL25       33 South Africa     1
## 10 ST3184 KL25       35 South Africa     1
# top-5 coverage
obayes_raw_coverage %>% filter(rank==5)
## # A tibble: 5 × 7
##    rank  mean median lower upper locus subgroup       
##   <int> <dbl>  <dbl> <dbl> <dbl> <chr> <fct>          
## 1     5 0.855  0.855 0.819 0.893 O2a   Global         
## 2     5 0.919  0.919 0.871 0.968 O2a   Eastern Africa 
## 3     5 0.757  0.757 0.688 0.829 O2a   Southern Asia  
## 4     5 0.833  0.830 0.664 1.02  O2a   Western Africa 
## 5     5 0.774  0.773 0.676 0.874 O2a   Southern Africa
obayes_raw_coverage_subgroups %>% filter(rank==5)
## # A tibble: 4 × 8
##    rank  mean median lower upper locus type  subgroup
##   <int> <dbl>  <dbl> <dbl> <dbl> <chr> <chr> <fct>   
## 1     5 0.865  0.865 0.828 0.903 O2a   min10 ESBL    
## 2     5 0.774  0.774 0.704 0.844 O2a   min10 CP      
## 3     5 0.897  0.896 0.813 0.985 O2a   min10 Fatal   
## 4     5 0.855  0.855 0.819 0.893 O2a   <NA>  All

Table S5 - O coverage stats for all/ESBL/CP/fatal, for global and regional

obayes_fatal_raw_min10perSite_coverageRegional <- getCumulativeCoverageGlobalRegional(ranks=obayes$locus_rank, bayes=obayes_fatal_raw_min10perSite, maxRank=10)

obayes_esbl_raw_min10perSite_coverageRegional <- getCumulativeCoverageGlobalRegional(ranks=obayes$locus_rank, bayes=obayes_esbl_raw_min10perSite, maxRank=10)

obayes_cp_raw_min10perSite_coverageRegional <- getCumulativeCoverageGlobalRegional(ranks=obayes$locus_rank, bayes=obayes_cp_raw_min10perSite, maxRank=10)

tableS5 <- obayes_raw_coverage %>% filter(rank <= 10) %>% mutate(cases="All") %>%
  bind_rows(obayes_fatal_raw_min10perSite_coverageRegional %>% mutate(cases="  Fatal infections")) %>%
  bind_rows(obayes_esbl_raw_min10perSite_coverageRegional %>% mutate(cases="  ESBL infections")) %>%
  bind_rows(obayes_cp_raw_min10perSite_coverageRegional %>% mutate(cases="  CP infections")) %>%
  mutate(upper=if_else(upper>1, 1, upper)) %>%
  mutate(summary=paste0(round(mean,3)*100," (",round(lower,3)*100,"-",round(upper,3)*100,")")) %>% 
  select(locus, subgroup, cases, summary) %>% 
  pivot_wider(names_from=locus, values_from=summary, id_cols=c(subgroup, cases)) %>%
  rename(Region=subgroup)

add totals per subgroup

tableS5 <- esbl_counts %>% mutate(cases="  ESBL infections") %>%
  bind_rows(cp_counts %>% mutate(cases="  CP infections")) %>%
  bind_rows(fatal_counts %>% mutate(cases="  Fatal infections")) %>%
  bind_rows(total_counts %>% mutate(cases="All")) %>%
  full_join(tableS5, by=c("Region", "cases")) %>%
  mutate(cases=fct_relevel(cases,c("All", "  Fatal infections", "  ESBL infections", "  CP infections")))

# sort function isn't working when ordering region factor using fct_relevel
tableS5$Region <- factor(tableS5$Region, levels = c("Global", "Eastern Africa", "Southern Africa", "Western Africa", "Southern Asia"))

tableS5 <- tableS5 %>%
  arrange(Region, cases) %>%
  relocate(n, .after=cases) %>% 
  mutate(cases=if_else(cases=="All", Region, cases)) %>%
  rename(N=n) %>%
  rename(Data=cases) %>%
  ungroup %>% select(-Region)
  
write_tsv(tableS5, file="tables/TableS5_coverage.tsv")

numbers for text - effect of cluster adjustment

kbayes_global %>% mutate(diff=mean.raw-mean.adj) %>% select(locus, mean.raw, mean.adj, diff, rank.raw, rank.adj) %>% arrange(-diff)
## # A tibble: 90 × 6
##    locus mean.raw mean.adj    diff rank.raw rank.adj
##    <chr>    <dbl>    <dbl>   <dbl>    <int>    <int>
##  1 KL102  0.181    0.0872  0.0933         1        2
##  2 KL15   0.0933   0.0543  0.0390         3        4
##  3 KL2    0.120    0.0897  0.0301         2        1
##  4 KL104  0.0341   0.00880 0.0253         6       30
##  5 KL157  0.0154   0.00126 0.0142        14       87
##  6 KL81   0.0256   0.0126  0.0129         9       25
##  7 KL108  0.0219   0.0101  0.0118        11       28
##  8 KL149  0.0267   0.0215  0.00514        8       13
##  9 KL127  0.00586  0.00380 0.00206       34       50
## 10 KL145  0.00321  0.00128 0.00194       38       72
## # ℹ 80 more rows
data_NNS_sites10 %>% filter(K_locus=="KL102") %>% group_by(Cluster, Country, ST) %>% count() %>% arrange(-n) %>% mutate(p=n/data_NNS_sites10 %>% filter(K_locus=="KL102") %>% nrow()) %>% ungroup() %>% mutate(cum=cumsum(p))
## # A tibble: 69 × 6
##    Cluster   Country      ST         n      p   cum
##    <chr>     <chr>        <chr>  <int>  <dbl> <dbl>
##  1 SZ1       Zambia       ST307    203 0.599  0.599
##  2 CHRF187   Bangladesh   ST147     21 0.0619 0.661
##  3 SZ3       Zambia       ST307     11 0.0324 0.693
##  4 Kilifi141 Kenya        ST3247     8 0.0236 0.717
##  5 BG3       South Africa ST307      5 0.0147 0.732
##  6 CHRF120   Bangladesh   ST307      5 0.0147 0.746
##  7 MB31      Ghana        ST307      5 0.0147 0.761
##  8 BG33      South Africa ST307      4 0.0118 0.773
##  9 ML65      Malawi       ST307      4 0.0118 0.785
## 10 SZ30      Zambia       ST307      4 0.0118 0.796
## # ℹ 59 more rows
data_NNS_sites10 %>% filter(K_locus=="KL2") %>% group_by(Cluster, Country, ST) %>% count() %>% arrange(-n) %>% mutate(p=n/data_NNS_sites10 %>% filter(K_locus=="KL2") %>% nrow()) %>% ungroup() %>% mutate(cum=cumsum(p))
## # A tibble: 71 × 6
##    Cluster                      Country ST        n       p   cum
##    <chr>                        <chr>   <chr> <int>   <dbl> <dbl>
##  1 ML2                          Malawi  ST39     87 0.387   0.387
##  2 ML9                          Malawi  ST14     23 0.102   0.489
##  3 Kiambu sub-County Hospital30 Kenya   ST14     12 0.0533  0.542
##  4 Mbagathi54                   Kenya   ST14     11 0.0489  0.591
##  5 Kilifi86                     Kenya   ST14      7 0.0311  0.622
##  6 ML1                          Malawi  ST14      6 0.0267  0.649
##  7 ML29                         Malawi  ST25      4 0.0178  0.667
##  8 ML3                          Malawi  ST25      4 0.0178  0.684
##  9 Kilifi125                    Kenya   ST101     3 0.0133  0.698
## 10 MB85                         Malawi  ST25      2 0.00889 0.707
## # ℹ 61 more rows
data_NNS_sites10 %>% filter(K_locus=="KL15") %>% group_by(Cluster, Country, ST) %>% count() %>% arrange(-n) %>% mutate(p=n/data_NNS_sites10 %>% filter(K_locus=="KL15") %>% nrow()) %>% ungroup() %>% mutate(cum=cumsum(p))
## # A tibble: 41 × 6
##    Cluster Country    ST         n      p   cum
##    <chr>   <chr>      <chr>  <int>  <dbl> <dbl>
##  1 CHRF149 Bangladesh ST11      66 0.377  0.377
##  2 BA24    Ethiopia   ST37      26 0.149  0.526
##  3 CHRF169 Bangladesh ST502     14 0.08   0.606
##  4 NEO2    India      ST11       9 0.0514 0.657
##  5 ML94    Malawi     ST45       7 0.04   0.697
##  6 aiims10 India      ST11       6 0.0343 0.731
##  7 MB49    Zambia     ST5856     4 0.0229 0.754
##  8 BA20    Ethiopia   ST37       3 0.0171 0.771
##  9 CHRF143 Bangladesh ST273      3 0.0171 0.789
## 10 CHRF186 Bangladesh ST340      2 0.0114 0.8  
## # ℹ 31 more rows
data_NNS_sites10 %>% filter(K_locus=="KL104") %>% group_by(Cluster, Country, ST) %>% count() %>% arrange(-n) %>% mutate(p=n/data_NNS_sites10 %>% filter(K_locus=="KL104") %>% nrow()) %>% ungroup() %>% mutate(cum=cumsum(p))
## # A tibble: 8 × 6
##   Cluster   Country  ST             n      p   cum
##   <chr>     <chr>    <chr>      <int>  <dbl> <dbl>
## 1 MB98      Tanzania ST1741        50 0.781  0.781
## 2 Kilifi117 Kenya    ST1741         5 0.0781 0.859
## 3 aiims21   India    ST1741         3 0.0469 0.906
## 4 aiims13   India    ST1741         2 0.0312 0.938
## 5 MB94      Tanzania ST1741         1 0.0156 0.953
## 6 MB96      Tanzania ST1741         1 0.0156 0.969
## 7 MB98      Tanzania ST1741-1LV     1 0.0156 0.984
## 8 ML46      Malawi   ST245-2LV      1 0.0156 1
data_NNS_sites10 %>% filter(K_locus=="KL157") %>% group_by(Cluster, Country, ST) %>% count() %>% arrange(-n) %>% mutate(p=n/data_NNS_sites10 %>% filter(K_locus=="KL157") %>% nrow()) %>% ungroup() %>% mutate(cum=cumsum(p))
## # A tibble: 1 × 6
##   Cluster                      Country ST         n     p   cum
##   <chr>                        <chr>   <chr>  <int> <dbl> <dbl>
## 1 Kiambu sub-County Hospital42 Kenya   ST6775    29     1     1
data_NNS_sites10 %>% filter(K_locus=="KL81") %>% group_by(Cluster, Country, ST) %>% count() %>% arrange(-n) %>% mutate(p=n/data_NNS_sites10 %>% filter(K_locus=="KL81") %>% nrow()) %>% ungroup() %>% mutate(cum=cumsum(p))
## # A tibble: 9 × 6
##   Cluster Country    ST        n      p   cum
##   <chr>   <chr>      <chr> <int>  <dbl> <dbl>
## 1 CHRF182 Bangladesh ST16     28 0.583  0.583
## 2 AKU15   Pakistan   ST16      7 0.146  0.729
## 3 CHRF126 Bangladesh ST16      6 0.125  0.854
## 4 AKU69   Pakistan   ST16      2 0.0417 0.896
## 5 AKU39   Pakistan   ST16      1 0.0208 0.917
## 6 CHRF165 Bangladesh ST16      1 0.0208 0.938
## 7 CHRF167 Bangladesh ST16      1 0.0208 0.958
## 8 CHRF176 Bangladesh ST16      1 0.0208 0.979
## 9 CHRF178 Bangladesh ST16      1 0.0208 1
data_NNS_sites10 %>% filter(K_locus=="KL108") %>% group_by(Cluster, Country, ST) %>% count() %>% arrange(-n) %>% mutate(p=n/data_NNS_sites10 %>% filter(K_locus=="KL108") %>% nrow()) %>% ungroup() %>% mutate(cum=cumsum(p))
## # A tibble: 8 × 6
##   Cluster Country      ST        n      p   cum
##   <chr>   <chr>        <chr> <int>  <dbl> <dbl>
## 1 BA23    Ethiopia     ST35     32 0.780  0.780
## 2 BA30    Ethiopia     ST35      3 0.0732 0.854
## 3 BA25    Ethiopia     ST35      1 0.0244 0.878
## 4 BA26    Ethiopia     ST35      1 0.0244 0.902
## 5 BA32    Ethiopia     ST35      1 0.0244 0.927
## 6 BA73    Nigeria      ST395     1 0.0244 0.951
## 7 ML149   Malawi       ST110     1 0.0244 0.976
## 8 WI35    South Africa ST35      1 0.0244 1
data_NNS_sites10 %>% filter(K_locus=="KL149") %>% group_by(Cluster, Country, ST) %>% count() %>% arrange(-n) %>% mutate(p=n/data_NNS_sites10 %>% filter(K_locus=="KL149") %>% nrow()) %>% ungroup() %>% mutate(cum=cumsum(p))
## # A tibble: 17 × 6
##    Cluster Country      ST        n     p   cum
##    <chr>   <chr>        <chr> <int> <dbl> <dbl>
##  1 BG27    South Africa ST152    16  0.32  0.32
##  2 WI20    South Africa ST39     10  0.2   0.52
##  3 BG77    South Africa ST39      5  0.1   0.62
##  4 CHRF139 Bangladesh   ST152     3  0.06  0.68
##  5 BG16    South Africa ST152     2  0.04  0.72
##  6 BG37    South Africa ST152     2  0.04  0.76
##  7 WI43    South Africa ST39      2  0.04  0.8 
##  8 BG40    South Africa ST39      1  0.02  0.82
##  9 BG72    South Africa ST39      1  0.02  0.84
## 10 BG84    South Africa ST152     1  0.02  0.86
## 11 BG93    South Africa ST39      1  0.02  0.88
## 12 BG96    South Africa ST39      1  0.02  0.9 
## 13 MB70    Ethiopia     ST152     1  0.02  0.92
## 14 WI15    South Africa ST152     1  0.02  0.94
## 15 WI17    South Africa ST39      1  0.02  0.96
## 16 WI25    South Africa ST39      1  0.02  0.98
## 17 WI36    South Africa ST152     1  0.02  1
# how do these compare to those added to KL set 2 to get good region coverage? - only KL81
k_rank_region8 %>% left_join(kbayes$locus, by="locus") %>% filter(rank.y>20)
## # A tibble: 5 × 3
##   locus rank.x rank.y
##   <chr>  <int>  <int>
## 1 KL28      16     22
## 2 KL81      17     25
## 3 KL8       18     27
## 4 KL116     19     35
## 5 KL53      20     41

Leave One out (LOO)

Sensitivity analysis - O type, Bayesian estimate, adjusted

loo <- obayes$global_est %>% mutate(Study="None")

loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_AKU_global_estimates.csv") %>% 
  mutate(Study="AKU") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 15 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_BabyGERMS_global_estimates.csv") %>% 
  mutate(Study="BabyGERMS") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 15 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_BARNARDS_global_estimates.csv") %>% 
  mutate(Study="BARNARDS") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 14 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_Botswana_global_estimates.csv") %>% 
  mutate(Study="Botswana") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 15 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_CHRF_global_estimates.csv") %>% 
  mutate(Study="CHRF") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 14 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_AIIMS_global_estimates.csv") %>% 
  mutate(Study="DH") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 15 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_GBS_global_estimates.csv") %>% 
  mutate(Study="GBSCOP") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 15 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_Kilifi_global_estimates.csv") %>% 
  mutate(Study="Kilifi") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 14 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_MBIRA_global_estimates.csv") %>% 
  mutate(Study="MBIRA") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 15 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_MLW_global_estimates.csv") %>% 
  mutate(Study="MLW") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 14 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_NeoBAC_global_estimates.csv") %>% 
  mutate(Study="NeoBAC") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 15 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_NeoOBS_global_estimates.csv") %>% 
  mutate(Study="NeoOBS-India") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 15 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo_O <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_SPINZ_global_estimates.csv") %>% 
  mutate(Study="SPINZ") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo) %>%
  mutate(locus=replace(locus, locus=="O2a.v1", "O2a")) %>%
  mutate(locus=replace(locus, locus=="O2afg.v2", "O2afg"))
## Rows: 15 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Bayes global sensitivity analysis, top20 in any, print rank - O type

loo_O_plot <- loo_O %>% 
  filter(rank<=9) %>%
  ggplot(aes(y=locus, x=Study, fill=factor(rank))) +
  geom_tile() + 
  geom_text(aes(y=locus, x=Study, label=round(rank)), size=4) +
  scale_y_discrete(limits=rev(obayes$locus_rank$locus[1:9])) + 
  scale_x_discrete(limits=rev(unique(loo_O$Study))) + # keep the order as per tibble, starting with 'None'
  #scale_fill_gradient("Global\nprevalence (%)", low = "white", high = "#081d58")
  scale_fill_manual(values = c(rev(brewer.pal(5, "Reds")), rev(brewer.pal(6, "Blues")[-1]))) +
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10, 
                                   colour = "black"), 
        axis.title = element_text(size = 10, colour = "black"),
        axis.text.y = element_text(size = 10, colour = "black"),
        plot.title = element_text(hjust=0.5),
        legend.position = "right") +
  labs(y="", x="Excluded study", title="Sensitivity analysis - O types", fill="Rank")

Sensitivity analysis - K locus, Bayesian estimate, adjusted

loo <- kbayes$global_est %>% mutate(Study="None")

loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_AKU_global_estimates.csv") %>% 
  mutate(Study="AKU") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 87 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_BabyGERMS_global_estimates.csv") %>% 
  mutate(Study="BabyGERMS") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 89 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_BARNARDS_global_estimates.csv") %>% 
  mutate(Study="BARNARDS") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 85 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_Botswana_global_estimates.csv") %>% 
  mutate(Study="Botswana") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 90 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_CHRF_global_estimates.csv") %>% 
  mutate(Study="CHRF") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 84 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_AIIMS_global_estimates.csv") %>% 
  mutate(Study="DH") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 89 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_GBS_global_estimates.csv") %>% 
  mutate(Study="GBSCOP") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 90 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_Kilifi_global_estimates.csv") %>% 
  mutate(Study="Kilifi") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 85 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_MBIRA_global_estimates.csv") %>% 
  mutate(Study="MBIRA") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 88 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_MLW_global_estimates.csv") %>% 
  mutate(Study="MLW") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 87 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_NeoBAC_global_estimates.csv") %>% 
  mutate(Study="NeoBAC") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 89 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_NeoOBS_global_estimates.csv") %>% 
  mutate(Study="NeoOBS-India") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 90 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_SPINZ_global_estimates.csv") %>% 
  mutate(Study="SPINZ") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo) 
## Rows: 90 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

sensitivity analysis for K, top20 in any, print rank

top20_sens <- loo %>% filter(rank<=20) %>% pull(locus) %>% unique()

loo_k_ranks <- loo %>% 
  filter(rank<=20) %>%
  ggplot(aes(y=locus, x=Study, fill=factor(rank))) +
  geom_tile() + 
  geom_text(aes(y=locus, x=Study, label=round(rank)), size=4) +
  scale_y_discrete(limits=rev(kbayes$locus_rank$locus[kbayes$locus_rank$locus %in% top20_sens])) + 
  scale_x_discrete(limits=rev(unique(loo$Study))) + # keep the order as per tibble, starting with 'None'
  #scale_fill_gradient("Global\nprevalence (%)", low = "white", high = "#081d58")
  scale_fill_manual(values = c(rev(brewer.pal(5, "Reds")), rev(brewer.pal(6, "Purples")[-1]), rev(brewer.pal(6, "Blues")[-1]), rev(brewer.pal(5, "Greens")))) +
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10, 
                                   colour = "black"), 
        axis.title = element_text(size = 10, colour = "black"),
        axis.text.y = element_text(size = 10, colour = "black"),
        plot.title = element_text(hjust=0.5),
        legend.position = "right") +
  labs(y="", x="Excluded study", title="Sensitivity analysis", fill="Rank") +
  geom_hline(yintercept=length(top20_sens)+0.5-5) +
  geom_hline(yintercept=length(top20_sens)+0.5-10) +
  geom_hline(yintercept=length(top20_sens)+0.5-15) +
  geom_hline(yintercept=length(top20_sens)+0.5-20)

LOO - K coverage

studies <- unique(data_NNS_sites10$Study)
studies <- studies[-c(8,12)]
studies <- c(studies, c("AIIMS", "NeoOBS"))

loo_k_coverage <- tibble(
    rank = integer(0),
    mean = double(0),
    median = double(0),
    lower = double(0),
    upper = double(0),
    locus = character(0),
    subgroup = character(0),
    Study = character(0)
  )

for (study in studies) {

# read raw Bayesian estimates for K
loo_kbayes_raw <- parseModelledEstimates(global_post=paste0("outputs_LOO/K_Full_min10_raw_28_LOO_",study,"_posterior_global.csv.gz"), region_post=paste0("outputs_LOO/K_Full_min10_raw_28_LOO_",study,"_posterior_subgroup.csv.gz"))

this_coverage <- getCumulativeCoverageGlobalRegional(loo_kbayes_raw, kbayes$locus_rank) %>%
  mutate(subgroup=fct_relevel(subgroup,c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa"))) %>%
  mutate(Study=study)

loo_k_coverage <- bind_rows(loo_k_coverage, this_coverage)
}
## Rows: 1068000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4272000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1080000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4320000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1044000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4176000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1008000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4032000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1020000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4080000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1080000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4320000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1080000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4320000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1020000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4080000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1068000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4272000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1044000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4176000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1056000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4224000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1068000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4272000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1080000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4320000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
loo_k_coverage_plot <- loo_k_coverage %>% 
  mutate(subgroup=fct_relevel(subgroup,rev(c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa")))) %>%
  bind_rows(kbayes_raw_coverage %>% mutate(Study="None")) %>% 
  filter(rank==20) %>% 
  mutate(Study=if_else(Study=="NeoOBS", "NeoOBS-India", Study)) %>%
  mutate(Study=if_else(Study=="AIIMS", "DH", Study)) %>%
  mutate(Study=if_else(Study=="GBS", "GBSCOP", Study)) %>%
  ggplot(aes(x=Study, y=subgroup, fill=mean)) + 
    geom_tile() + 
    geom_text(aes(y=subgroup, x=Study, label=round(100*mean, 2)), size=3) +
    scale_x_discrete(limits=rev(unique(loo$Study))) + 
    scale_fill_gradient(low="white", high="grey") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10, 
                                     colour = "black"), 
          axis.title = element_text(size = 10, colour = "black"),
          axis.text.y = element_text(size = 10, colour = "black"),
          plot.title = element_text(hjust=0.5),
          legend.position = "right") +
    labs(y="", x="Excluded study", fill="Coverage")

Appendix Fig S2.1 - LOO K

loo_k_ranks / loo_k_coverage_plot + patchwork::plot_layout(heights=c(4,1), axes="collect")

ggsave(file="figures/AppendixFigS2.1_LOO_BayesGlobal_Klocus.pdf", width=9, height=11)
ggsave(file="figures/AppendixFigS2.1_LOO_BayesGlobal_Klocus.png", width=9, height=11)
loo_o_coverage <- tibble(
    rank = integer(0),
    mean = double(0),
    median = double(0),
    lower = double(0),
    upper = double(0),
    locus = character(0),
    subgroup = character(0),
    Study = character(0)
  )

for (study in studies) {

# read raw Bayesian estimates for K
loo_obayes_raw <- parseModelledEstimates(global_post=paste0("outputs_LOO/O_Full_min10_raw_28_LOO_",study,"_posterior_global.csv.gz"), region_post=paste0("outputs_LOO/O_Full_min10_raw_28_LOO_",study,"_posterior_subgroup.csv.gz"), fixOnames = T)

this_coverage <- getCumulativeCoverageGlobalRegional(loo_obayes_raw, obayes$locus_rank) %>%
  mutate(subgroup=fct_relevel(subgroup,c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa"))) %>%
  mutate(Study=study)

loo_o_coverage <- bind_rows(loo_o_coverage, this_coverage)
}
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 168000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 672000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 168000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 672000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 168000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 672000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 168000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 672000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
loo_o_coverage_plot <- loo_o_coverage %>% 
  mutate(subgroup=fct_relevel(subgroup,rev(c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa")))) %>%
  bind_rows(obayes_raw_coverage %>% mutate(Study="None")) %>% 
  filter(rank==5) %>% 
  mutate(Study=if_else(Study=="NeoOBS", "NeoOBS-India", Study)) %>%
  mutate(Study=if_else(Study=="AIIMS", "DH", Study)) %>%
  mutate(Study=if_else(Study=="GBS", "GBSCOP", Study)) %>%
  ggplot(aes(x=Study, y=subgroup, fill=mean)) + 
    geom_tile() + 
    geom_text(aes(y=subgroup, x=Study, label=round(100*mean, 2)), size=3) +
    scale_x_discrete(limits=rev(unique(loo$Study))) + 
    scale_fill_gradient(low="lightgrey", high="darkgrey") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10, 
                                     colour = "black"), 
          axis.title = element_text(size = 10, colour = "black"),
          axis.text.y = element_text(size = 10, colour = "black"),
          plot.title = element_text(hjust=0.5),
          legend.position = "right") +
    labs(y="", x="Excluded study", fill="Coverage")

Appendix Fig S2.2

loo_O_plot / loo_o_coverage_plot + plot_layout(heights=c(2,1), axes="collect")

ggsave(file="figures/AppendixFigS2.2_LOO_BayesGlobal_Olocus.pdf", width=9, height=9)
ggsave(file="figures/AppendixFigS2.2_LOO_BayesGlobal_Olocus.png", width=9, height=9)

EFFECT OF TEMPORAL THRESHOLD FOR CLUSTERING ON MODELLED ESTIMATES

compare global prevalence distributions using counts cluster-adjusted using 28 vs 365-day threshold

# read Bayesian estimates for K adjusted using 365 days
kbayes_365 <- parseModelledEstimates(global_post="outputs_core/K_Full_min10_adj_365_posterior_global.csv.gz", region_post="outputs_core/K_Full_min10_adj_28_posterior_subgroup.csv.gz")
## Rows: 1092000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4320000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
# plot raw vs adjusted distributions, overlaid - for fig S15
k_dist_28_365 <- comparative_locus_ridgesplot(posterior1=kbayes$global_post,
                             posterior2=kbayes_365$global_post,
                             ranks=kbayes$locus_rank, 
                             lines_every=10, xlim=c(0,12), xbreaks=seq(0,12,1),
                             maxRank=30, type1="28 days", type2="365 days", legend_title="Cluster threshold", scale=2, title="e) Modelled K estimates - posterior")
## Joining with `by = join_by(locus)`
## Picking joint bandwidth of 0.0765
## Warning: Removed 99 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

# read Bayesian estimates for O adjusted using 365 days
obayes_365 <- parseModelledEstimates(global_post="outputs_core/O_Full_min10_adj_365_posterior_global.csv.gz", region_post="outputs_core/O_Full_min10_adj_365_posterior_subgroup.csv.gz", fixOnames = T)
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
# plot raw vs adjusted distributions, overlaid - for fig S15
o_dist_28_365 <- comparative_locus_ridgesplot(posterior1=obayes$global_post,
                             posterior2=obayes_365$global_post,
                             ranks=obayes$locus_rank, 
                             lines_every=5, xlim=c(0,30), xbreaks=seq(0,30,2),
                             maxRank=15, type1="28 days", type2="365 days", legend_title="Cluster threshold", scale=2, title="f) Modelled O estimates - posterior")
## Joining with `by = join_by(locus)`
## Picking joint bandwidth of 0.0965
## Warning: Removed 126 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

Scatter plots for K mean and rank

adj_28_vs_365 <- full_join(kbayes$global_est %>% arrange(-mean) %>% ungroup %>% mutate(rank=row_number()), 
          kbayes_365$global_est %>% arrange(-mean) %>% ungroup() %>% mutate(rank=row_number()),
          by="locus", suffix=c(".28", ".365")) 

timeThreshold_estimates_adj <- adj_28_vs_365 %>%
  ggplot(aes(x=mean.28, y=mean.365)) + 
  geom_abline(intercept=0, slope=1, col="grey") +
  geom_point() + theme_bw() + 
  labs(x="28-day threshold", y="365-day threshold", title="c) Mean K estimates")

timeThreshold_ranks_adj <- adj_28_vs_365 %>%
  ggplot(aes(x=rank.28, y=rank.365)) + 
  geom_abline(intercept=0, slope=1, col="grey") +
  geom_point() + theme_bw()  + 
  labs(x="28-day threshold", y="365-day threshold", title="a) Modelled K ranks")

timeThreshold_estimates_adj + timeThreshold_ranks_adj
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).

Scatter plots for O mean and rank

adj_28_vs_365_o <- full_join(obayes$global_est %>% arrange(-mean) %>% ungroup %>% mutate(rank=row_number()), 
          obayes_365$global_est %>% arrange(-mean) %>% ungroup() %>% mutate(rank=row_number()),
          by="locus", suffix=c(".28", ".365")) 

timeThreshold_estimates_adj_o <- adj_28_vs_365_o %>%
  ggplot(aes(x=mean.28, y=mean.365)) + 
  geom_abline(intercept=0, slope=1, col="grey") +
  geom_point() + theme_bw() + 
  labs(x="28-day threshold", y="365-day threshold", title="d) Mean O estimates")

timeThreshold_ranks_adj_o <- adj_28_vs_365_o %>%
  ggplot(aes(x=rank.28, y=rank.365)) + 
  geom_abline(intercept=0, slope=1, col="grey") +
  geom_point() + theme_bw()  + 
  labs(x="28-day threshold", y="365-day threshold", title="b) Modelled O ranks")

timeThreshold_estimates_adj_o + timeThreshold_ranks_adj_o

Appendix Fig S2.5

(timeThreshold_ranks_adj + timeThreshold_ranks_adj_o) / (timeThreshold_estimates_adj + timeThreshold_estimates_adj_o) / (k_dist_28_365 + o_dist_28_365 + patchwork::plot_layout(guides="collect")) + patchwork::plot_layout(heights=c(1,1,2)) & theme(legend.position='bottom')
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Picking joint bandwidth of 0.0765
## Warning: Removed 99 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Picking joint bandwidth of 0.0965
## Warning: Removed 126 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

ggsave("figures/AppendixFigS2.5_temporalThresholdModelled.pdf", width=8, height=10)
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Picking joint bandwidth of 0.0765
## Warning: Removed 99 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Picking joint bandwidth of 0.0965
## Warning: Removed 126 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
ggsave("figures/AppendixFigS2.5_temporalThresholdModelled.png", width=8, height=10)
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Picking joint bandwidth of 0.0765
## Warning: Removed 99 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Picking joint bandwidth of 0.0965
## Warning: Removed 126 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).